├── .gitignore
├── CUDA
    ├── AOSandSOA
    │   ├── AoS.cu
    │   └── SoA.cu
    ├── BFS
    │   └── bfs.cu
    ├── ConstantMemory
    │   └── constantStencil.cu
    ├── Instruction
    │   ├── atomic-ordering.cu
    │   ├── floating-point-accuracy.cu
    │   ├── floating-point-perf.cu
    │   ├── fmad.cu
    │   ├── intrinsic-standard-comp.cu
    │   └── my-atomic-add.cu
    ├── MatrixTranspose
    │   └── transpose.cu
    ├── NVIDIA_Online_Training
    │   ├── Fundamentals_of_CUDA_C_C++
    │   │   ├── 01_hello-gpu.cu
    │   │   ├── 02_first-parallel.cu
    │   │   ├── 03_thread-and-block-idx.cu
    │   │   ├── 04_single-block-loop.cu
    │   │   ├── 05_multiple-block-loop.cu
    │   │   ├── 06_double-elements.cu
    │   │   ├── 07_grid-stride-double.cu
    │   │   ├── 08_add-error-handling.cu
    │   │   ├── 09_vector-add.cu
    │   │   ├── 10_matrix-multiply-2d.cu
    │   │   ├── 11_get-device-properties.cu
    │   │   ├── 12_page-faults.cu
    │   │   ├── 13_print-numbers.cu
    │   │   ├── 14_n-body.cu
    │   │   ├── 15_vector-add-manual-alloc.cu
    │   │   └── 16_vector-add-overlap-xfer.cu
    │   └── Scaling_Workloads_Across_Multiple_GPUs_with_CUDA_C++
    │   │   └── mgpu_with_streams.cu
    ├── SharedMemory
    │   ├── reduceInteger.cu
    │   ├── smemRectangular.cu
    │   ├── smemSquare.cu
    │   └── transposeRectangular.cu
    ├── StreamsAndEvents
    │   ├── asyncAPI.cu
    │   ├── simpleCallback.cu
    │   ├── simpleHyperQBreadth.cu
    │   ├── simpleHyperQDependece.cu
    │   ├── simpleHyperQDepth.cu
    │   ├── simpleMultiAddBreadth.cu
    │   └── simpleMultiAddDepth.cu
    ├── UnifiedMemory
    │   ├── matrixAddWithUnifiedMemory.cu
    │   └── matrixAddWithoutUnifiedMemory.cu
    ├── WarpShuffle
    │   └── simpleShuffle.cu
    ├── bezierCurves
    │   ├── bezierCurves.cuh
    │   ├── bezierCurves1.cu
    │   └── bezierCurves2.cu
    ├── common
    │   ├── common.h
    │   └── common_string.h
    ├── convolution1D
    │   └── conv1D.cu
    ├── convolution2D
    │   └── conv2D.cu
    ├── deviceQuery
    │   ├── deviceQuery.cu
    │   └── simpleDeviceQuery.cu
    ├── histogram
    │   └── histogram.cu
    ├── imageProcessing
    │   ├── convertColorToGrey.cu
    │   ├── imageBlur.cu
    │   └── lena.jpg
    ├── matrixAdd
    │   ├── matrixAdd.cu
    │   └── matrixAdd2.cu
    ├── matrixMul
    │   └── matrixMul.cu
    ├── matrixMulTiling
    │   └── matrixMulTiling.cu
    ├── mergeSort
    │   └── mergeOperation.cu
    ├── prefixSum
    │   └── prefixSum.cu
    ├── reduction
    │   ├── nestedReduce.cu
    │   ├── reduceInteger.cu
    │   ├── reduction.cpp
    │   ├── reduction.h
    │   └── reductionKernel.cu
    ├── simpleDivergence
    │   └── simpleDivergence.cu
    ├── sparseMatrixVectorMul
    │   └── SpMV.cu
    └── vectorAdd
    │   ├── sumArrayZerocopy.cu
    │   └── vectorAdd.cu
├── OpenMP
    ├── 00_omp_hello.c
    ├── 01_omp_hello_errchk.c
    ├── 02_omp_trap1.c
    ├── 03_omp_trap2.c
    ├── 04_omp_trap3.c
    ├── 05_omp_trap4.c
    ├── 06_omp_fibo.c
    ├── 07_omp_pi.c
    ├── 08_omp_odd_even1.c
    ├── 09_omp_odd_even2.c
    ├── 10_omp_sin_sum.c
    ├── 11_omp_msg.c
    ├── 12_omp_mat_vec_mul.c
    ├── 13_omp_private.c
    ├── 14_omp_mat_mul.c
    └── queue
    │   ├── queue.c
    │   └── queue.h
├── cblas_mat_mul.c
├── cublas_mat_mul.cu
├── cuda_mat_mul.cu
├── mkl_mat_mul.c
├── mpi
    ├── 00_mpi_hello.c
    ├── 01_serial_trap.c
    ├── 02_mpi_trap1.c
    ├── 03_mpi_output.c
    ├── 04_mpi_trap2.c
    ├── 05_mpi_trap3.c
    ├── 06_mpi_trap4.c
    ├── 07_mpi_vec_add.c
    ├── 08_mpi_mat_vec_mul.c
    ├── 09_serial_mat_vec_mul_time.c
    ├── 10_mpi_mat_vec_mul_time.c
    ├── 11_serial_odd_even_sort.c
    ├── 12_mpi_odd_even_sort_unsafe.c
    ├── 13_mpi_odd_even_sort_safe.c
    └── 14_mpi_mat_mul.c
├── ocv_mat_mul.c
└── pthread
    ├── 00_pth_hello.c
    ├── 01_pth_mat_vec_mul.c
    ├── 02_pth_pi.c
    ├── 03_pth_pi_busy1.c
    ├── 04_pth_pi_busy2.c
    ├── 05_pth_pi_mutex.c
    ├── 06_pth_message.c
    ├── 07_pth_message_sem.c
    ├── 08_pth_busy_barrier.c
    ├── 09_pth_sem_barrier.c
    ├── 10_pth_cond_barrier.c
    ├── 11_pth_posix_barrier.c
    ├── 12_pth_tokenize.c
    └── 13_pth_mat_mul.c


/.gitignore:
--------------------------------------------------------------------------------
 1 | *
 2 | !*.*
 3 | !*/
 4 | 
 5 | .vscode/
 6 | 
 7 | *.exe
 8 | *.exp
 9 | *.pdb
10 | *.ncu*
11 | *.lib


--------------------------------------------------------------------------------
/CUDA/AOSandSOA/AoS.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        AoS.cu
  3 |  * Description: This is a simple example of using an array of structures to
  4 |  *              store data on the device.
  5 |  *              
  6 |  * Compile:     nvcc -O3 -o AoS AoS.cu -I..
  7 |  * Run:         ./AoS [n]
  8 |  *                  [n] : the number of threads in a block
  9 |  *****************************************************************************/
 10 | 
 11 | #include <stdio.h>
 12 | #include <cuda_runtime.h>
 13 | #include "common/common.h"
 14 | 
 15 | #define LEN 1 << 20
 16 | 
 17 | struct innerStruct {
 18 |     float x;
 19 |     float y;
 20 | };
 21 | 
 22 | void initialInnerStruct(innerStruct* in, const int N)
 23 | {
 24 |     for (int i = 0; i < N; i++) {
 25 |         in[i].x = (rand() & 0xFF) / 100.f;
 26 |         in[i].y = (rand() & 0xFF) / 100.f;
 27 |     }
 28 | }
 29 | 
 30 | void testInnerStructHost(innerStruct* data, innerStruct* result, const int N)
 31 | {
 32 |     for (int i = 0; i < N; i++) {
 33 |         result[i].x = data[i].x + 10.f;
 34 |         result[i].y = data[i].y + 20.f;
 35 |     }
 36 | }
 37 | 
 38 | void checkInnerStruct(innerStruct* hostRef, innerStruct* gpuRef, const int N)
 39 | {
 40 |     double epsilon = 1.0e-8;
 41 |     
 42 |     for (int i = 0; i < N; i++) {
 43 |         if (abs(hostRef[i].x - gpuRef[i].x) > epsilon) {
 44 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef[i].x, gpuRef[i].x);
 45 |             printf("Arrays do not match.\n\n");
 46 | 
 47 |             break;
 48 |         }
 49 |         if (abs(hostRef[i].y - gpuRef[i].y) > epsilon) {
 50 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef[i].y, gpuRef[i].y);
 51 |             printf("Arrays do not match.\n\n");
 52 | 
 53 |             break;
 54 |         }
 55 |     }
 56 | }
 57 | 
 58 | __global__
 59 | void testInnerStruct(innerStruct* data, innerStruct* result, const int N)
 60 | {
 61 |     unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
 62 | 
 63 |     if (idx < N) {
 64 |         innerStruct tmp = data[idx];
 65 |         tmp.x += 10.f;
 66 |         tmp.y += 20.f;
 67 |         result[idx] = tmp;
 68 |     }
 69 | }
 70 | 
 71 | __global__
 72 | void warmup(innerStruct* data, innerStruct* result, const int N)
 73 | {
 74 |     unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
 75 | 
 76 |     if (idx < N) {
 77 |         innerStruct tmp = data[idx];
 78 |         tmp.x += 10.f;
 79 |         tmp.y += 20.f;
 80 |         result[idx] = tmp;
 81 |     }
 82 | }
 83 | 
 84 | int main(int argc, char** argv)
 85 | {
 86 |     // setup device
 87 |     int dev = 0;
 88 |     cudaDeviceProp deviceProp;
 89 |     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 90 |     printf("Test struct of array at device %d: %s\n", dev, deviceProp.name);
 91 |     CUDA_CHECK(cudaSetDevice(dev));
 92 | 
 93 |     // allocate host memory
 94 |     int nElem = LEN;
 95 |     size_t nBytes = nElem * sizeof(innerStruct);
 96 |     innerStruct *h_A = (innerStruct*)malloc(nBytes);
 97 |     innerStruct *hostRef = (innerStruct*)malloc(nBytes);
 98 |     innerStruct *gpuRef = (innerStruct*)malloc(nBytes);
 99 | 
100 |     // initialize host array
101 |     initialInnerStruct(h_A, nElem);
102 |     testInnerStructHost(h_A, hostRef, nElem);
103 | 
104 |     // allocate device memory
105 |     innerStruct* d_A, *d_C;
106 |     CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes));
107 |     CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes));
108 | 
109 |     // copy data from host to device
110 |     CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
111 | 
112 |     // setup execution configuration
113 |     int threads = 128;
114 |     if (argc > 1)
115 |         threads = atoi(argv[1]);
116 |     
117 |     dim3 blocks(threads, 1);
118 |     dim3 grids((blocks.x + nElem - 1) / blocks.x, 1);
119 | 
120 |     double start, finish;
121 |     // kernel 1: warmup
122 |     GET_TIME(start);
123 |     warmup<<<grids, blocks>>>(d_A, d_C, nElem);
124 |     cudaDeviceSynchronize();
125 |     GET_TIME(finish);
126 |     //printf("warpup      <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start);
127 |     CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
128 |     checkInnerStruct(hostRef, gpuRef, nElem);
129 | 
130 |     // kernel 2: testInnerStruct
131 |     GET_TIME(start);
132 |     testInnerStruct<<<grids, blocks>>>(d_A, d_C, nElem);
133 |     CUDA_CHECK(cudaDeviceSynchronize());
134 |     GET_TIME(finish);
135 |     printf("innerstruct <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start);
136 |     CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
137 |     checkInnerStruct(hostRef, gpuRef, nElem);
138 | 
139 |     // free memories bost host and device
140 |     CUDA_CHECK(cudaFree(d_A));
141 |     CUDA_CHECK(cudaFree(d_C));
142 |     free(h_A);
143 |     free(hostRef);
144 |     free(gpuRef);
145 | 
146 |     CUDA_CHECK(cudaDeviceReset());
147 |     return 0;
148 | }


--------------------------------------------------------------------------------
/CUDA/AOSandSOA/SoA.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        SoA.cu
  3 |  * Description: This is a simple example of using a structure of arrays to
  4 |  *              store data on the device.
  5 |  *              
  6 |  * Compile:     nvcc -O3 -o SoA SoA.cu -I..
  7 |  * Run:         ./SoA [n]
  8 |  *                  [n] : the number of threads in a block
  9 |  *****************************************************************************/
 10 | 
 11 | #include <stdio.h>
 12 | #include <cuda_runtime.h>
 13 | #include "common/common.h"
 14 | 
 15 | #define LEN 1 << 20
 16 | 
 17 | struct innerArray {
 18 |     float x[LEN];
 19 |     float y[LEN];
 20 | };
 21 | 
 22 | void initialInnerArray(innerArray* in, const int N)
 23 | {
 24 |     for (int i = 0; i < N; i++) {
 25 |         in->x[i] = (rand() & 0xFF) / 100.f;
 26 |         in->y[i] = (rand() & 0xFF) / 100.f;
 27 |     }
 28 | }
 29 | 
 30 | void testInnerArrayHost(innerArray* data, innerArray* result, const int N)
 31 | {
 32 |     for (int i = 0; i < N; i++) {
 33 |         result->x[i] = data->x[i] + 10.f;
 34 |         result->y[i] = data->y[i] + 20.f;
 35 |     }
 36 | }
 37 | 
 38 | void checkInnerArray(innerArray* hostRef, innerArray* gpuRef, const int N)
 39 | {
 40 |     double epsilon = 1.0e-8;
 41 |     
 42 |     for (int i = 0; i < N; i++) {
 43 |         if (abs(hostRef->x[i] - gpuRef->x[i]) > epsilon) {
 44 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef->x[i], gpuRef->x[i]);
 45 |             printf("Arrays do not match.\n\n");
 46 | 
 47 |             break;
 48 |         }
 49 |         if (abs(hostRef->y[i] - gpuRef->y[i]) > epsilon) {
 50 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef->y[i], gpuRef->y[i]);
 51 |             printf("Arrays do not match.\n\n");
 52 | 
 53 |             break;
 54 |         }
 55 |     }
 56 | }
 57 | 
 58 | __global__
 59 | void testInnerArray(innerArray* data, innerArray* result, const int N)
 60 | {
 61 |     unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
 62 | 
 63 |     if (idx < N) {
 64 |         float tmpX = data->x[idx];
 65 |         float tmpY = data->y[idx];
 66 | 
 67 |         tmpX += 10.f;
 68 |         tmpY += 20.f;
 69 |         result->x[idx] = tmpX;
 70 |         result->y[idx] = tmpY;
 71 |     }
 72 | }
 73 | 
 74 | __global__
 75 | void warmup(innerArray* data, innerArray* result, const int N)
 76 | {
 77 |     unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
 78 | 
 79 |     if (idx < N) {
 80 |         float tmpX = data->x[idx];
 81 |         float tmpY = data->y[idx];
 82 | 
 83 |         tmpX += 10.f;
 84 |         tmpY += 20.f;
 85 |         result->x[idx] = tmpX;
 86 |         result->y[idx] = tmpY;
 87 |     }
 88 | }
 89 | 
 90 | int main(int argc, char** argv)
 91 | {
 92 |     // setup device
 93 |     int dev = 0;
 94 |     cudaDeviceProp deviceProp;
 95 |     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 96 |     printf("Test struct of array at device %d: %s\n", dev, deviceProp.name);
 97 |     CUDA_CHECK(cudaSetDevice(dev));
 98 | 
 99 |     // allocate host memory
100 |     int nElem = LEN;
101 |     size_t nBytes = sizeof(innerArray);
102 |     innerArray *h_A = (innerArray*)malloc(nBytes);
103 |     innerArray *hostRef = (innerArray*)malloc(nBytes);
104 |     innerArray *gpuRef = (innerArray*)malloc(nBytes);
105 | 
106 |     // initialize host array
107 |     initialInnerArray(h_A, nElem);
108 |     testInnerArrayHost(h_A, hostRef, nElem);
109 | 
110 |     // allocate device memory
111 |     innerArray* d_A, *d_C;
112 |     CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes));
113 |     CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes));
114 | 
115 |     // copy data from host to device
116 |     CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
117 | 
118 |     // setup execution configuration
119 |     int threads = 128;
120 |     if (argc > 1)
121 |         threads = atoi(argv[1]);
122 |     
123 |     dim3 blocks(threads, 1);
124 |     dim3 grids((blocks.x + nElem - 1) / blocks.x, 1);
125 | 
126 |     double start, finish;
127 |     // kernel 1: warmup
128 |     GET_TIME(start);
129 |     warmup<<<grids, blocks>>>(d_A, d_C, nElem);
130 |     cudaDeviceSynchronize();
131 |     GET_TIME(finish);
132 |     //printf("warpup      <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start);
133 |     CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
134 |     checkInnerArray(hostRef, gpuRef, nElem);
135 | 
136 |     // kernel 2: testInnerArray
137 |     GET_TIME(start);
138 |     testInnerArray<<<grids, blocks>>>(d_A, d_C, nElem);
139 |     CUDA_CHECK(cudaDeviceSynchronize());
140 |     GET_TIME(finish);
141 |     printf("innerarray  <<< %3d, %3d >>> elapsed %f sec\n", grids.x, blocks.x, finish-start);
142 |     CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
143 |     checkInnerArray(hostRef, gpuRef, nElem);
144 | 
145 |     // free memories bost host and device
146 |     CUDA_CHECK(cudaFree(d_A));
147 |     CUDA_CHECK(cudaFree(d_C));
148 |     free(h_A);
149 |     free(hostRef);
150 |     free(gpuRef);
151 | 
152 |     CUDA_CHECK(cudaDeviceReset());
153 |     return 0;
154 | }


--------------------------------------------------------------------------------
/CUDA/Instruction/atomic-ordering.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        atomic-ordering.cu
  3 |  * Description: This is an example that illustrates the difference between using
  4 |  *              atomic operations and using unsafe accesses to increment a shared
  5 |  *              variable.
  6 |  *              In both the atomics() and unsafe() kernels, each thread repeatedly
  7 |  *              increments a globally shared variable by 1. Each thread also stores
  8 |  *              the value it reads from the shared location for the first increment.
  9 |  *              
 10 |  * Compile:     nvcc -o atomic-ordering atomic-ordering.cu -I..
 11 |  * Run:         ./atomic-ordering
 12 |  *****************************************************************************/
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | #include <cuda_runtime.h>
 16 | #include "common/common.h"
 17 | 
 18 | __global__
 19 | void atomics(int* shared_var, int* values_read, int N, int iters)
 20 | {
 21 |     int tid = blockDim.x * blockIdx.x + threadIdx.x;
 22 | 
 23 |     if (tid > N)
 24 |         return;
 25 | 
 26 |     values_read[tid] = atomicAdd(shared_var, 1);
 27 |     
 28 |     for (int i = 0; i < iters; i++)
 29 |         atomicAdd(shared_var, 1);
 30 | }
 31 | 
 32 | __global__
 33 | void unsafe(int* shared_var, int* values_read, int N, int iters)
 34 | {
 35 |     int tid = blockDim.x * blockIdx.x + threadIdx.x;
 36 | 
 37 |     if (tid > N)
 38 |         return;
 39 | 
 40 |     int old = *shared_var;
 41 |     *shared_var = old + 1;
 42 |     values_read[tid] = old;
 43 | 
 44 |     for (int i = 0; i < iters; i++) {
 45 |         int old = *shared_var;
 46 |         *shared_var = old + 1;
 47 |     }
 48 | }
 49 | 
 50 | void print_read_results(int *h_arr, int *d_arr, int N, const char* label)
 51 | {
 52 |     int maxNumToPrint = 10;
 53 |     int nToPrint = N > maxNumToPrint ? maxNumToPrint : N;
 54 | 
 55 |     CUDA_CHECK(cudaMemcpy(h_arr, d_arr, nToPrint * sizeof(int), cudaMemcpyDeviceToHost));
 56 |     printf("Threads performing %s operations read values", label);
 57 | 
 58 |     for (int i = 0; i < nToPrint; i++) {
 59 |         printf(" %d", h_arr[i]);
 60 |     }
 61 |     printf("\n");
 62 | }
 63 | 
 64 | int main(int argc, char** argv)
 65 | {
 66 |     int N = 64;
 67 |     int block = 32;
 68 |     int runs = 30;
 69 |     int iters = 100000;
 70 |     int *d_shared_var;
 71 |     int h_shared_var_atomic, h_shared_var_unsafe;
 72 |     int *d_values_read_atomic, *d_values_read_unsafe;
 73 |     int *h_values_read;
 74 | 
 75 |     CUDA_CHECK(cudaMalloc((void**)&d_shared_var, sizeof(int)));
 76 |     CUDA_CHECK(cudaMalloc((void**)&d_values_read_atomic, N * sizeof(int)));
 77 |     CUDA_CHECK(cudaMalloc((void**)&d_values_read_unsafe, N * sizeof(int)));
 78 |     h_values_read = (int*)malloc(N * sizeof(int));
 79 | 
 80 |     double atomic_mean_time = 0;
 81 |     double unsafe_mean_time = 0;
 82 | 
 83 |     for (int r = 0; r < runs; r++) {
 84 |         double start, stop;
 85 |         GET_TIME(start);
 86 |         CUDA_CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int)));
 87 |         atomics<<<N / block, block>>>(d_shared_var, d_values_read_atomic, N, iters);
 88 |         CUDA_CHECK(cudaDeviceSynchronize());
 89 |         GET_TIME(stop)
 90 |         atomic_mean_time += (stop - start);
 91 |         CUDA_CHECK(cudaMemcpy(&h_shared_var_atomic, d_shared_var, sizeof(int), cudaMemcpyDeviceToHost));
 92 | 
 93 |         GET_TIME(start);
 94 |         CUDA_CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int)));
 95 |         unsafe<<<N / block, block>>>(d_shared_var, d_values_read_unsafe, N, iters);
 96 |         CUDA_CHECK(cudaDeviceSynchronize());
 97 |         GET_TIME(stop);
 98 |         unsafe_mean_time += stop - start;
 99 |         CUDA_CHECK(cudaMemcpy(&h_shared_var_unsafe, d_shared_var, sizeof(int), cudaMemcpyDeviceToHost));
100 |     }
101 | 
102 |     printf("In total, %d runs using atomic operations took %f s\n", runs, atomic_mean_time);
103 |     printf("  Using atomic operations also produced an output of %d\n", h_shared_var_atomic);
104 |     printf("In total, %d runs using unsafe operations took %f s\n", runs, unsafe_mean_time);
105 |     printf("  Using unsafe operations also produced an output of %d\n", h_shared_var_unsafe);
106 | 
107 |     print_read_results(h_values_read, d_values_read_atomic, N, "atomic");
108 |     print_read_results(h_values_read, d_values_read_unsafe, N, "unsafe");
109 | 
110 |     return 0;
111 | }


--------------------------------------------------------------------------------
/CUDA/Instruction/floating-point-accuracy.cu:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        floating-point-accuracy.cu
 3 |  * Description: This is an example to demonstrate floating-point's inability to 
 4 |  *              represent certain values with a specific value as an example.
 5 |  * 
 6 |  *              In this example, the value 12.1 is stored in single- and
 7 |  *              double-precision floating-point variables on both the host and
 8 |  *              device. After retrieving the results from the device, the actual
 9 |  *              values stored are printed to 20 decimal places and the single- and
10 |  *              double-precision results from the host and device are compared to
11 |  *              each other to verify that host and device are equally accurate for
12 |  *              the same type.
13 |  *              
14 |  * Compile:     nvcc -o floating-point-accuracy floating-point-accuracy.cu -I..
15 |  * Run:         ./floating-point-accuracy
16 |  *****************************************************************************/
17 | #include <stdio.h>
18 | #include <stdlib.h>
19 | #include <cuda_runtime.h>
20 | #include "common/common.h"
21 | 
// Kernel: the thread with global index 0 stores the literal 12.1 into both
// the single-precision location *f and the double-precision location *d.
// All other threads do nothing.
__global__
void kernel(float* f, double* d)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid != 0)
        return;

    *f = 12.1;
    *d = 12.1;
}
32 | 
// Entry point: stores 12.1 in float and double variables on the host and, via
// kernel(), on the device; prints all four stored values to 20 decimal places
// and reports whether host and device agree for each type.
int main(int argc, char **argv)
{
    float *deviceF;
    float h_deviceF;
    double *deviceD;
    double h_deviceD;

    float hostF = 12.1;
    double hostD = 12.1;

    CUDA_CHECK(cudaMalloc((void **)&deviceF, sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&deviceD, sizeof(double)));
    kernel<<<1, 32>>>(deviceF, deviceD);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(&h_deviceF, deviceF, sizeof(float),
                     cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_deviceD, deviceD, sizeof(double),
                     cudaMemcpyDeviceToHost));

    printf("Host single-precision representation of 12.1   = %.20f\n", hostF);
    printf("Host double-precision representation of 12.1   = %.20f\n", hostD);
    // The device lines must show the values copied back from the GPU, not the
    // host variables.
    printf("Device single-precision representation of 12.1 = %.20f\n", h_deviceF);
    printf("Device double-precision representation of 12.1 = %.20f\n", h_deviceD);
    printf("Device and host single-precision representation equal? %s\n",
           hostF == h_deviceF ? "yes" : "no");
    printf("Device and host double-precision representation equal? %s\n",
           hostD == h_deviceD ? "yes" : "no");

    // release device memory
    CUDA_CHECK(cudaFree(deviceF));
    CUDA_CHECK(cudaFree(deviceD));

    return 0;
}


--------------------------------------------------------------------------------
/CUDA/Instruction/fmad.cu:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        fmad.cu
 3 |  * Description: This is an example that illustrates the effect on numerical accuracy
 4 |  *              of fusing a multiply-add into a single MAD instruction.
 5 |  *              
 6 |  * Compile:     nvcc -o fmad fmad.cu -I.. [--fmad=true or false]
 7 |  * Run:         ./fmad
 8 |  *****************************************************************************/
 9 | #include <stdio.h>
10 | #include <stdlib.h>
11 | #include <cuda_runtime.h>
12 | #include "common/common.h"
13 | 
// Kernel: the thread with global index 0 computes x * x + y and stores the
// result in *out. All other threads do nothing.
__global__
void fmad_kernel(double x, double y, double *out)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid != 0)
        return;

    // The multiply and add stay in one expression, exactly as written, so
    // the compiler's fusing decision is unaffected.
    *out = x * x + y;
}
23 | 
// Host counterpart of fmad_kernel: returns x * x + y.
double host_fmad_kernel(double x, double y)
{
    // The multiply and add stay in one expression, exactly as written.
    const double result = x * x + y;
    return result;
}
28 | 
// Entry point: evaluates x * x + y on the host and on the device for fixed
// inputs and reports whether the two results are bit-identical, printing the
// absolute difference when they are not.
int main(int argc, char** argv)
{
    double *d_out, h_out;
    double x = 2.891903;
    double y = -3.980364;

    double host_value = host_fmad_kernel(x, y);

    CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)));
    fmad_kernel<<<1, 32>>>(x, y, d_out);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost));

    if (host_value == h_out) {
        printf("The device output the same value as the host.\n");
    }
    else {
        printf("The device output a different value than the host, diff=%e.\n", fabs(host_value - h_out));
    }

    // release device memory
    CUDA_CHECK(cudaFree(d_out));

    return 0;
}


--------------------------------------------------------------------------------
/CUDA/Instruction/intrinsic-standard-comp.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        intrinsic-standard-comp.cu
  3 |  * Description: This is an example to demonstrate the relative performance and
  4 |  *              accuracy of CUDA standard and intrinsic functions.
  5 |  * 
  6 |  *              The computational kernel of this example is the iterative
  7 |  *              calculation of a value squared. This computation is done on the
  8 |  *              host, on the device with a standard function, and on the device
  9 |  *              with an intrinsic function. The results from all three are compared
 10 |  *              for numerical accuracy (with the host as the baseline), and the
 11 |  *              performance of the standard and intrinsic functions is also compared.
 12 |  *              
 13 |  * Compile:     nvcc -o intrinsic-standard-comp intrinsic-standard-comp.cu -I..
 14 |  * Run:         ./intrinsic-standard-comp
 15 |  *****************************************************************************/
 16 | #include <stdio.h>
 17 | #include <stdlib.h>
 18 | #include <cuda_runtime.h>
 19 | #include "common/common.h"
 20 | 
 21 | /* Perform iters power operations using the standard powf function. */
 22 | __global__
 23 | void standard_kernel(float a, float *out, int iters)
 24 | {
 25 |     int tid = blockDim.x * blockIdx.x + threadIdx.x;
 26 | 
 27 |     if (tid == 0) {
 28 |         float tmp;
 29 | 
 30 |         for (int i = 0; i < iters; i++)
 31 |             tmp = powf(a, 2.0f);
 32 |         
 33 |         *out = tmp;
 34 |     }
 35 | }
 36 | 
 37 | /* Perform iters power operations using the intrinsic __powf function. */
 38 | __global__
 39 | void intrinsic_kernel(float a, float *out, int iters)
 40 | {
 41 |     int tid = blockDim.x * blockIdx.x + threadIdx.x;
 42 | 
 43 |     if (tid == 0) {
 44 |         float tmp;
 45 | 
 46 |         for (int i = 0; i < iters; i++)
 47 |             tmp = __powf(a, 2.0f);
 48 |         
 49 |         *out = tmp;
 50 |     }
 51 | }
 52 | 
 53 | int main(int argc, char** argv)
 54 | {
 55 |     int runs = 30;
 56 |     int iters = 1000;
 57 | 
 58 |     float *d_standard_out, h_standard_out;
 59 |     CUDA_CHECK(cudaMalloc((void**)&d_standard_out, sizeof(float)));
 60 | 
 61 |     float *d_intrinsic_out, h_intrinsic_out;
 62 |     CUDA_CHECK(cudaMalloc((void**)&d_intrinsic_out, sizeof(float)));
 63 | 
 64 |     float input_value = 8181.25;
 65 | 
 66 |     double mean_standard_time = 0.0;
 67 |     double mean_intrinsic_time = 0.0;
 68 | 
 69 |     for (int i = 0; i < runs; i++) {
 70 |         double start, stop;
 71 | 
 72 |         GET_TIME(start);
 73 |         standard_kernel<<<1, 32>>>(input_value, d_standard_out, iters);
 74 |         CUDA_CHECK(cudaDeviceSynchronize());
 75 |         GET_TIME(stop);
 76 |         mean_standard_time += stop - start;
 77 |         
 78 |         GET_TIME(start);
 79 |         intrinsic_kernel<<<1, 32>>>(input_value, d_intrinsic_out, iters);
 80 |         CUDA_CHECK(cudaDeviceSynchronize());
 81 |         GET_TIME(stop);
 82 |         mean_intrinsic_time += stop - start;
 83 |     }
 84 | 
 85 |     CUDA_CHECK(cudaMemcpy(&h_standard_out, d_standard_out, sizeof(float), cudaMemcpyDeviceToHost));
 86 |     CUDA_CHECK(cudaMemcpy(&h_intrinsic_out, d_intrinsic_out, sizeof(float), cudaMemcpyDeviceToHost));
 87 |     float host_value = powf(input_value, 2.0f);
 88 | 
 89 |     mean_standard_time /= runs;
 90 |     mean_intrinsic_time /= runs;
 91 | 
 92 |     printf("Host calculated\t\t\t%f\n", host_value);
 93 |     printf("Standard Device calculated\t%f\n", h_standard_out);
 94 |     printf("Intrinsic Device calculated\t%f\n", h_intrinsic_out);
 95 |     printf("Host equals Standard?\t\t%s, diff=%e\n",
 96 |            host_value == h_standard_out ? "Yes" : "No",
 97 |            fabs(host_value - h_standard_out));
 98 |     printf("Host equals Intrinsic?\t\t%s, diff=%e\n",
 99 |            host_value == h_intrinsic_out ? "Yes" : "No",
100 |            fabs(host_value - h_intrinsic_out));
101 |     printf("Standard equals Intrinsic?\t%s, diff=%e\n",
102 |            h_standard_out == h_intrinsic_out ? "Yes" : "No",
103 |            fabs(h_standard_out - h_intrinsic_out));
104 |     printf("\n");
105 |     printf("Mean execution time for standard function powf:    %f ms\n",
106 |            mean_standard_time * 1000.f);
107 |     printf("Mean execution time for intrinsic function __powf: %f ms\n",
108 |            mean_intrinsic_time * 1000.f);
109 | 
110 |     return 0;
111 | }


--------------------------------------------------------------------------------
/CUDA/Instruction/my-atomic-add.cu:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        my-atomic-add.cu
 3 |  * Description: This is an example that illustrates the implementation of custom atomic
 4 |  *              operations using CUDA's built-in atomicCAS function to implement
 5 |  *              atomic signed 32-bit integer addition.
 6 |  *              
 7 |  * Compile:     nvcc -o my-atomic-add my-atomic-add.cu -I..
 8 |  * Run:         ./my-atomic-add
 9 |  *****************************************************************************/
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <cuda_runtime.h>
13 | #include "common/common.h"
14 | 
// Atomically add `incr` to *address using atomicCAS and return the value that
// was stored at *address immediately before the successful update.
__device__
int myAtomicAdd(int* address, int incr)
{
    // Start from a plain read as the expected current value.
    int expected = *address;

    for (;;) {
        // atomicCAS returns the value it found; the swap took effect only if
        // that value matches what we expected.
        const int observed = atomicCAS(address, expected, expected + incr);
        if (observed == expected)
            return observed;

        // Another thread changed the value first: retry with what was seen.
        expected = observed;
    }
}
30 | 
// Kernel: every launched thread adds 1 to *sharedInteger through
// myAtomicAdd; the returned previous value is not used.
__global__
void kernel(int *sharedInteger)
{
    const int increment = 1;
    myAtomicAdd(sharedInteger, increment);
}
36 | 
// Entry point: zeroes a device integer, launches 4 blocks of 128 threads that
// each increment it once via myAtomicAdd, and prints the resulting value.
int main(int argc, char **argv)
{
    int h_sharedInteger;
    int *d_sharedInteger;
    CUDA_CHECK(cudaMalloc((void **)&d_sharedInteger, sizeof(int)));
    CUDA_CHECK(cudaMemset(d_sharedInteger, 0x00, sizeof(int)));

    kernel<<<4, 128>>>(d_sharedInteger);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(&h_sharedInteger, d_sharedInteger, sizeof(int), cudaMemcpyDeviceToHost));
    printf("4 x 128 increments led to value of %d\n", h_sharedInteger);

    // release device memory
    CUDA_CHECK(cudaFree(d_sharedInteger));

    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/01_hello-gpu.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | void helloCPU()
 4 | {
 5 |     printf("Hello from the CPU.\n");
 6 | }
 7 | 
 8 | __global__ void helloGPU()
 9 | {
10 |     printf("Hello from the GPU.\n");
11 | }
12 | 
// Entry point: prints the CPU greeting, launches helloGPU with one block of
// one thread, and waits for it to finish.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main()
{
    helloCPU();

    helloGPU<<<1, 1>>>();

    // Launch-configuration errors are reported by cudaGetLastError(); errors
    // raised while the kernel runs are returned by the synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/02_first-parallel.cu:
--------------------------------------------------------------------------------
 1 | // 02_first-parallel.cu
 2 | #include <stdio.h>
 3 | 
 4 | __global__
 5 | void firstParallel()
 6 | {
 7 |     printf("This is running in parallel.\n");
 8 | }
 9 | 
/* Launches firstParallel with 5 blocks of 5 threads and waits for completion. */
int main()
{
    const int numberOfBlocks = 5;
    const int threadsPerBlock = 5;

    firstParallel<<<numberOfBlocks, threadsPerBlock>>>();

    // Wait for the kernel to finish before main returns.
    cudaDeviceSynchronize();
    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/03_thread-and-block-idx.cu:
--------------------------------------------------------------------------------
 1 | // 03_thread-and-block-idx.cu
 2 | #include <stdio.h>
 3 | 
 4 | __global__
 5 | void printSuccessForCorrectExecutionConfiguration()
 6 | {
 7 |     if (threadIdx.x == 1023 && blockIdx.x == 255) {
 8 |         printf("Success.\n");
 9 |     }
10 | }
11 | 
/*
 * Launches the kernel with 256 blocks of 1024 threads, the configuration that
 * contains the (block 255, thread 1023) pair the kernel looks for.
 */
int main()
{
    const int numberOfBlocks = 256;
    const int threadsPerBlock = 1024;

    printSuccessForCorrectExecutionConfiguration<<<numberOfBlocks, threadsPerBlock>>>();

    // Wait for the kernel to finish before main returns.
    cudaDeviceSynchronize();
    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/04_single-block-loop.cu:
--------------------------------------------------------------------------------
 1 | // 04_single-block-loop
 2 | #include <stdio.h>
 3 | 
 4 | __global__
 5 | void loop()
 6 | {
 7 |     printf("This is iteration number %d\n", threadIdx.x);
 8 | }
 9 | 
/* Launches loop with one block of 10 threads and waits for completion. */
int main()
{
    const int numberOfBlocks = 1;
    const int threadsPerBlock = 10;

    loop<<<numberOfBlocks, threadsPerBlock>>>();

    // Wait for the kernel to finish before main returns.
    cudaDeviceSynchronize();
    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/05_multiple-block-loop.cu:
--------------------------------------------------------------------------------
 1 | // 05_multiple-block-loop
 2 | #include <stdio.h>
 3 | 
 4 | __global__
 5 | void loop()
 6 | {
 7 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 8 |     printf("This is iteration number %d\n", idx);
 9 | }
10 | 
/* Launches loop with 2 blocks of 5 threads and waits for completion. */
int main()
{
    const int numberOfBlocks = 2;
    const int threadsPerBlock = 5;

    loop<<<numberOfBlocks, threadsPerBlock>>>();

    // Wait for the kernel to finish before main returns.
    cudaDeviceSynchronize();
    return 0;
}
16 | 
/*
 * Reference notes: host allocation versus managed allocation.
 *
 * These snippets are kept as a comment because they are statements (and both
 * define the same names N, size and a), so they cannot appear at file scope.
 *
 * CPU-only:
 *
 *     int N = 2<<20;
 *     size_t size = N * sizeof(int);
 *
 *     int *a;
 *     a = (int *)malloc(size);
 *
 *     // Use `a` in CPU-only program.
 *
 *     free(a);
 *
 * Accelerated:
 *
 *     int N = 2<<20;
 *     size_t size = N * sizeof(int);
 *
 *     int *a;
 *     // Note the address of `a` is passed as first argument.
 *     cudaMallocManaged(&a, size);
 *
 *     // Use `a` on the CPU and/or on any GPU in the accelerated system.
 *
 *     cudaFree(a);
 */


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/06_double-elements.cu:
--------------------------------------------------------------------------------
 1 | // 06_double-elements.cu
 2 | #include <stdio.h>
 3 | 
 4 | void init(int *a, const int N)
 5 | {
 6 |     for (int i = 0; i < N; i++) {
 7 |         a[i] = i;
 8 |     }
 9 | }
10 | 
/*
 * Kernel: doubles a[idx] in place, where idx is the thread's global index.
 * Threads whose index is >= N do nothing, so the grid may be larger than N.
 */
__global__
void doubleElements(int *a, const int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        // Index with the computed global index (the original used an
        // undeclared `i` here).
        a[idx] *= 2;
    }
}
19 | 
/* Returns true when a[i] == 2 * i for every i in [0, N); false otherwise. */
bool checkElementsAreDoubled(int *a, const int N)
{
    bool allDoubled = true;

    // Stop scanning as soon as one element fails the check.
    for (int k = 0; allDoubled && k < N; ++k) {
        allDoubled = (a[k] == k * 2);
    }

    return allDoubled;
}
29 | 
/*
 * Allocates a managed array of 1000 ints, initializes it on the host, doubles
 * every element on the GPU and reports whether all elements were doubled.
 */
int main()
{
    int N = 1000;
    int *a;

    size_t size = N * sizeof(int);

    // Use 'cudaMallocManaged' to allocate pointer 'a' available
    // on both the host and the device.
    cudaMallocManaged(&a, size);

    init(a, N);

    // Round the block count up so the grid covers all N elements.
    size_t threads_per_block = 256;
    size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
    // Wait for the kernel before the host reads `a` again.
    cudaDeviceSynchronize();

    bool areDoubled = checkElementsAreDoubled(a, N);
    printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");

    // Use 'cudaFree' to free memory allocated with 'cudaMallocManaged'
    cudaFree(a);
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/07_grid-stride-double.cu:
--------------------------------------------------------------------------------
 1 | // 07_grid-stride-double.cu
 2 | #include <stdio.h>
 3 | 
 4 | void init(int *a, const int N)
 5 | {
 6 |     for (int i = 0; i < N; i++) {
 7 |         a[i] = i;
 8 |     }
 9 | }
10 | 
/*
 * Kernel: doubles every element of a[0..N-1] in place using a grid-stride
 * loop, so any launch configuration covers the whole array.
 */
__global__
void doubleElements(int *a, const int N)
{
    // Total number of threads in the grid: the distance between two elements
    // handled by the same thread.
    const int stride = gridDim.x * blockDim.x;

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while (i < N) {
        a[i] = a[i] * 2;
        i += stride;
    }
}
20 | 
/* Verifies that each of the first N elements of `a` equals twice its index. */
bool checkElementsAreDoubled(int *a, const int N)
{
    for (int pos = 0; pos < N; ++pos) {
        const int expected = pos * 2;
        if (a[pos] == expected) {
            continue;
        }
        // First mismatch decides the result.
        return false;
    }

    return true;
}
30 | 
/*
 * Allocates a managed array of 1000 ints, fills it on the host, doubles it on
 * the GPU and prints whether every element ended up doubled.
 */
int main()
{
    const int N = 1000;
    const size_t size = N * sizeof(int);

    // Managed memory: the same pointer is used by init() on the host and by
    // the kernel on the device.
    int *a;
    cudaMallocManaged(&a, size);

    init(a, N);

    const size_t threads_per_block = 256;
    const size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);

    // Wait for the kernel before the host reads `a` again.
    cudaDeviceSynchronize();

    const char *verdict = checkElementsAreDoubled(a, N) ? "TRUE" : "FALSE";
    printf("All elements were doubled? %s\n", verdict);

    // Memory from cudaMallocManaged is released with cudaFree.
    cudaFree(a);
    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/08_add-error-handling.cu:
--------------------------------------------------------------------------------
 1 | // 08_add-error-handling.cu
 2 | #include <stdio.h>
 3 | 
 4 | void init(int *a, const int N)
 5 | {
 6 |     for (int i = 0; i < N; i++) {
 7 |         a[i] = i;
 8 |     }
 9 | }
10 | 
/*
 * Kernel: doubles every element of a[0..N-1] in place using a grid-stride
 * loop.
 *
 * The loop is bounded by N. The earlier bound of `N + stride` made each
 * thread touch elements past the end of the N-element array.
 */
__global__
void doubleElements(int *a, const int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < N; i += stride) {
        a[i] *= 2;
    }
}
20 | 
/* Returns true only if a[i] == i * 2 holds for all i in [0, N). */
bool checkElementsAreDoubled(int *a, const int N)
{
    // Advance past the leading run of correctly doubled elements.
    int i = 0;
    while (i < N && a[i] == i * 2) {
        ++i;
    }

    // Reaching N means no mismatch was found.
    return i >= N;
}
30 | 
/*
 * Doubles a managed array of 1000 ints on the GPU with 32 blocks of 1024
 * threads, printing any launch or execution error reported by the runtime
 * and then whether all elements were doubled.
 */
int main()
{
    const int N = 1000;
    const size_t size = N * sizeof(int);

    // Managed memory: one pointer usable from both host and device code.
    int *a;
    cudaMallocManaged(&a, size);

    init(a, N);

    const size_t threads_per_block = 1024;
    const size_t number_of_blocks = 32;

    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);

    // cudaGetLastError reports problems with the launch itself;
    // cudaDeviceSynchronize reports errors raised while the kernel ran.
    const cudaError_t syncErr = cudaGetLastError();
    const cudaError_t asyncErr = cudaDeviceSynchronize();

    if (syncErr != cudaSuccess) {
        printf("Error(sync): %s\n", cudaGetErrorString(syncErr));
    }
    if (asyncErr != cudaSuccess) {
        printf("Error(async): %s\n", cudaGetErrorString(asyncErr));
    }

    const char *verdict = checkElementsAreDoubled(a, N) ? "TRUE" : "FALSE";
    printf("All elements were doubled? %s\n", verdict);

    // Memory from cudaMallocManaged is released with cudaFree.
    cudaFree(a);
    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/09_vector-add.cu:
--------------------------------------------------------------------------------
 1 | // 09_vector-add.cu
 2 | #include <stdio.h>
 3 | #include <assert.h>
 4 | 
 5 | inline cudaError_t checkCuda(cudaError_t result)
 6 | {
 7 |     if (result != cudaSuccess) {
 8 |         fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
 9 |         assert(result == cudaSuccess);
10 |     }
11 |     return result;
12 | }
13 | 
/* Host helper: sets each of the first N elements of `a` to `num`. */
void initWith(float num, float* a, const int N)
{
    int remaining = N;
    float *slot = a;
    while (remaining > 0) {
        *slot = num;
        ++slot;
        --remaining;
    }
}
20 | 
/*
 * Kernel: result[i] = a[i] + b[i] for every i in [0, N), using a grid-stride
 * loop so the launch configuration need not match N.
 */
__global__
void addVectorsInto(float* result, float* a, float* b, const int N)
{
    // Number of threads in the whole grid.
    const int stride = blockDim.x * gridDim.x;

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < N) {
        result[i] = a[i] + b[i];
        i += stride;
    }
}
31 | 
/*
 * Checks that the first N elements of `array` all equal `target`.
 * On the first mismatch it prints a FAIL line and exits with status 1;
 * otherwise it prints a success line.
 */
void checkElementsAre(float target, float* array, const int N)
{
    int i = 0;
    while (i < N) {
        if (array[i] != target) {
            printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
            exit(1);
        }
        ++i;
    }

    printf("SUCCESS! All values added correctly.\n");
}
42 | 
/*
 * Adds two managed vectors of 2 << 20 floats (all 3s and all 4s) on the GPU
 * with a single block of 1024 threads and verifies every sum equals 7.
 */
int main()
{
    const int N = 2 << 20;
    const size_t size = N * sizeof(float);

    float *a;
    float *b;
    float *c;

    checkCuda(cudaMallocManaged(&a, size));
    checkCuda(cudaMallocManaged(&b, size));
    checkCuda(cudaMallocManaged(&c, size));

    initWith(3, a, N);
    initWith(4, b, N);
    initWith(0, c, N);

    const size_t threadsPerBlock = 1024;
    const size_t numberOfBlocks = 1;

    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    // First check the launch itself, then wait for and check execution.
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());

    checkElementsAre(7, c, N);

    checkCuda(cudaFree(a));
    checkCuda(cudaFree(b));
    checkCuda(cudaFree(c));
    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/10_matrix-multiply-2d.cu:
--------------------------------------------------------------------------------
 1 | // 10_matrix-multiply-2d.cu
 2 | #include <stdio.h>
 3 | 
 4 | #define N 64
 5 | 
 6 | __global__
 7 | void matrixMulGPU(int* a, int* b, int* c)
 8 | {
 9 |     int val = 0;
10 | 
11 |     int row = blockIdx.x * blockDim.x + threadIdx.x;
12 |     int col = blockIdx.y * blockDim.y + threadIdx.y;
13 | 
14 |     if (row < N && col < N) {
15 |         for (int k = 0; k < N; k++) {
16 |             val += a[row * N + k] * b[k * N + col];
17 |         }
18 |         c[row * N + col] = val;
19 |     }
20 | }
21 | 
/*
 * Host reference: computes c = a * b for N x N row-major int matrices,
 * visiting the output cells in row-major order.
 */
void matrixMulCPU(int* a, int* b, int* c)
{
    for (int cell = 0; cell < N * N; ++cell) {
        const int row = cell / N;
        const int col = cell % N;

        // Dot product of row `row` of a with column `col` of b.
        int acc = 0;
        for (int k = 0; k < N; ++k) {
            acc += a[row * N + k] * b[k * N + col];
        }
        c[cell] = acc;
    }
}
36 | 
/*
 * Multiplies two N x N int matrices on the GPU and on the CPU and compares
 * the results, printing the first mismatch or "Success!".
 */
int main()
{
    int *a, *b, *c_cpu, *c_gpu;

    size_t size = N * N * sizeof(int); // The number of bytes of an N x N matrix

    // Allocate Memory
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c_cpu, size);
    cudaMallocManaged(&c_gpu, size);

    // Initialize Memory
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            a[row * N + col] = row;
            b[row * N + col] = col + 2;
            c_cpu[row * N + col] = 0;
            c_gpu[row * N + col] = 0;
        }
    }

    // configuration
    dim3 threads_per_block(16, 16, 1); // A 16 x 16 block threads
    // Ceiling division: enough blocks to cover N in each dimension without
    // an extra all-idle row/column of blocks when N is a multiple of 16.
    dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x,
                          (N + threads_per_block.y - 1) / threads_per_block.y,
                          1);

    matrixMulGPU<<<number_of_blocks, threads_per_block>>>(a, b, c_gpu);

    // Report launch errors and errors raised while the kernel ran, so a
    // failed kernel is not mistaken for a wrong result.
    cudaError_t launchErr = cudaGetLastError();
    cudaError_t execErr = cudaDeviceSynchronize();
    if (launchErr != cudaSuccess)
        printf("Error(launch): %s\n", cudaGetErrorString(launchErr));
    if (execErr != cudaSuccess)
        printf("Error(execution): %s\n", cudaGetErrorString(execErr));

    // Call the CPU version to check
    matrixMulCPU(a, b, c_cpu);

    // Compare the two answers; stop at the first mismatch (the outer loop
    // also tests `error`, since `break` alone only leaves the inner loop).
    bool error = false;
    for (int row = 0; row < N && !error; row++) {
        for (int col = 0; col < N; col++) {
            if (c_cpu[row * N + col] != c_gpu[row * N + col]) {
                printf("FOUND ERROR at c[%d][%d]\n", row, col);
                error = true;
                break;
            }
        }
    }

    if (!error) {
        printf("Success!\n");
    }

    // Free all allocated memory
    cudaFree(a);
    cudaFree(b);
    cudaFree(c_cpu);
    cudaFree(c_gpu);
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/11_get-device-properties.cu:
--------------------------------------------------------------------------------
 1 | // 11_get-device-properties
 2 | #include <stdio.h>
 3 | 
 4 | int main()
 5 | {
 6 |     /*
 7 |      * Device ID is required first to query the device.
 8 |      */
 9 | 
10 |     int deviceId;
11 |     cudaGetDevice(&deviceId);
12 | 
13 |     cudaDeviceProp props;
14 |     cudaGetDeviceProperties(&props, deviceId);
15 | 
16 |     /*
17 |      * `props` now contains several properties about the current device.
18 |      */
19 | 
20 |     int computeCapabilityMajor = props.major;
21 |     int computeCapabilityMinor = props.minor;
22 |     int multiProcessorCount = props.multiProcessorCount;
23 |     int warpSize = props.warpSize;
24 | 
25 |     printf("Device ID: %d\nNumber of SMs: %d\nCompute Capability Major: %d\nCompute Capability Minor: %d\nWarp Size: %d\n", deviceId, multiProcessorCount, computeCapabilityMajor, computeCapabilityMinor, warpSize);
26 | }


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/12_page-faults.cu:
--------------------------------------------------------------------------------
 1 | // 12_page-faults.cu
 2 | 
 3 | __global__
 4 | void deviceKernel(int *a, const int N)
 5 | {
 6 |     int idx = blockIdx.x + blockIdx.x * blockDim.x;
 7 |     int stride = blockDim.x * gridDim.x;
 8 | 
 9 |     for (int i = idx; i < N; i += stride) {
10 |         a[i] = i;
11 |     }
12 | }
13 | 
/* Host counterpart of deviceKernel: writes a[i] = i for every i in [0, N). */
void hostFunction(int *a, const int N)
{
    int i = 0;
    while (i < N) {
        a[i] = i;
        ++i;
    }
}
20 | 
/*
 * Scaffold for unified-memory page-fault experiments: allocates a managed
 * array of 2 << 24 ints and frees it. The experiment bodies listed in the
 * comment below are meant to be pasted in one at a time.
 */
int main()
{
    int N = 2 << 24;
    size_t size = N * sizeof(int);

    int *a;
    // Nothing below is meaningful without the allocation, so stop early if
    // it fails instead of handing an invalid pointer to the experiments.
    if (cudaMallocManaged(&a, size) != cudaSuccess) {
        return 1;
    }

    /*
     * Conduct experiments to learn more about the behavior of
     * `cudaMallocManaged`.
     *
     * What happens when unified memory is accessed only by the GPU?
     *   deviceKernel<<<256, 256>>>(a, N);
     *   cudaDeviceSynchronize();
     * What happens when unified memory is accessed only by the CPU?
     *   hostFunction(a, N);
     * What happens when unified memory is accessed first by the GPU then the CPU?
     *   deviceKernel<<<256, 256>>>(a, N);
     *   cudaDeviceSynchronize();
     *   hostFunction(a, N);
     * What happens when unified memory is accessed first by the CPU then the GPU?
     *   hostFunction(a, N);
     *   deviceKernel<<<256, 256>>>(a, N);
     *   cudaDeviceSynchronize();
     *
     * Hypothesize about UM behavior, page faulting specifically, before each
     * experiment, and then verify by running `nsys`.
     */


    cudaFree(a);
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/13_print-numbers.cu:
--------------------------------------------------------------------------------
 1 | // 13_print-numbers.cu
 2 | #include <stdio.h>
 3 | 
 4 | __global__
 5 | void printNumber(int number)
 6 | {
 7 |     printf("%d\n", number);
 8 | }
 9 | 
/*
 * Launches printNumber five times (arguments 0..4), each in its own newly
 * created stream that is destroyed right after the launch, then waits for
 * the device.
 */
int main()
{
    const int launches = 5;

    int i = 0;
    while (i < launches) {
        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // Single-thread launch in the stream created for this iteration.
        printNumber<<<1, 1, 0, stream>>>(i);

        cudaStreamDestroy(stream);
        ++i;
    }

    cudaDeviceSynchronize();
    return 0;
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/14_n-body.cu:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include "timer.h"
  5 | #include "files.h"
  6 | 
  7 | #define SOFTENING 1e-9f
  8 | 
  9 | /*
 10 |  * Each body contains x, y, and z coordinate positions,
 11 |  * as well as velocities in the x, y, and z directions.
 12 |  */
 13 | 
 14 | typedef struct { float x, y, z, vx, vy, vz; } Body;
 15 | 
 16 | /*
 17 |  * Calculate the gravitational impact of all bodies in the system
 18 |  * on all others.
 19 |  */
 20 | 
 21 | __global__
 22 | void bodyForce(Body *p, float dt, int n) {
 23 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 24 |     int stride = blockDim.x * gridDim.x;
 25 |     
 26 |     for (int i = idx; i < n; i += stride) {
 27 |         float Fx = 0.f, Fy = 0.f, Fz = 0.f;
 28 |         
 29 |         for (int j = 0; j < n; j++) {
 30 |             float dx = p[j].x - p[i].x;
 31 |             float dy = p[j].y - p[i].y;
 32 |             float dz = p[j].z - p[i].z;
 33 |             float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
 34 |             float invDist = rsqrtf(distSqr);
 35 |             float invDist3 = invDist * invDist * invDist;
 36 |             
 37 |             Fx += dx * invDist3;
 38 |             Fy += dy * invDist3;
 39 |             Fz += dz * invDist3;
 40 |         }
 41 |         
 42 |         p[i].vx += dt * Fx;
 43 |         p[i].vy += dt * Fy;
 44 |         p[i].vz += dt * Fz;
 45 |     }
 46 | }
 47 | 
 48 | __global__
 49 | void intergratePosition(Body *p, float dt, int n)
 50 | {
 51 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 52 |     int stride = blockDim.x * gridDim.x;
 53 |     
 54 |     for (int i = idx ; i < n; i += stride) 
 55 |     {
 56 |         p[i].x += p[i].vx * dt;
 57 |         p[i].y += p[i].vy * dt;
 58 |         p[i].z += p[i].vz * dt;
 59 |     }
 60 | }
 61 | 
 62 | int main(const int argc, const char **argv)
 63 | {
 64 |     int nBodies = 2 << 11;
 65 |     if (argc > 1)
 66 |         nBodies = 2 << atoi(argv[1]);
 67 | 
 68 |     int deviceId;
 69 |     checkCuda(cudaGetDevice(&deviceId));
 70 |     cudaDeviceProp props;
 71 |     checkCuda(cudaGetDeviceProperties(&props, deviceId));
 72 | 
 73 |     size_t threadsPerBlock = props.maxThreadsPerBlock;
 74 |     size_t numberOfBlocks = props.multiProcessorCount;
 75 | 
 76 |     // The assessment will pass hidden initialized values to check for correctness.
 77 |     // You should not make changes to these files, or else the assessment will not work.
 78 |     const char *initialized_values;
 79 |     const char *solution_values;
 80 | 
 81 |     if (nBodies == 2 << 11)
 82 |     {
 83 |         initialized_values = "09-nbody/files/initialized_4096";
 84 |         solution_values = "09-nbody/files/solution_4096";
 85 |     }
 86 |     else
 87 |     { // nBodies == 2<<15
 88 |         initialized_values = "09-nbody/files/initialized_65536";
 89 |         solution_values = "09-nbody/files/solution_65536";
 90 |     }
 91 | 
 92 |     if (argc > 2)
 93 |         initialized_values = argv[2];
 94 |     if (argc > 3)
 95 |         solution_values = argv[3];
 96 | 
 97 |     const float dt = 0.01f; // Time step
 98 |     const int nIters = 10;  // Simulation iterations
 99 | 
100 |     int bytes = nBodies * sizeof(Body);
101 |     float *buf;
102 | 
103 |     cudaMallocManaged(&buf, bytes);
104 | 
105 |     Body *p = (Body *)buf;
106 | 
107 |     cudaMemPrefetchAsync(buf, bytes, cudaCpuDeviceId);
108 |     read_values_from_file(initialized_values, buf, bytes);
109 | 
110 |     double totalTime = 0.0;
111 | 
112 |     /*
113 |      * This simulation will run for 10 cycles of time, calculating gravitational
114 |      * interaction amongst bodies, and adjusting their positions to reflect.
115 |      */
116 |     cudaMemPrefetchAsync(buf, bytes, device_id);
117 | 
118 |     for (int iter = 0; iter < nIters; iter++)
119 |     {
120 |         StartTimer();
121 | 
122 |         bodyForce<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies); // compute interbody forces
123 |         intergratePosition<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);
124 | 
125 |         cudaDeviceSynchronize();
126 | 
127 |         const double tElapsed = GetTimer() / 1000.0;
128 |         totalTime += tElapsed;
129 |     }
130 | 
131 |     double avgTime = totalTime / (double)(nIters);
132 |     float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
133 | 
134 |     cudaMemPrefetchAsync(buf, bytes, cudaCpuDeviceId);
135 |     write_values_to_file(solution_values, buf, bytes);
136 | 
137 |     // You will likely enjoy watching this value grow as you accelerate the application,
138 |     // but beware that a failure to correctly synchronize the device might result in
139 |     // unrealistically high values.
140 |     printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond);
141 | 
142 |     cudaFree(buf);
143 | }


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/15_vector-add-manual-alloc.cu:
--------------------------------------------------------------------------------
 1 | // 15_vector-add-manual-alloc.cu
 2 | #include <stdio.h>
 3 | #include <assert.h>
 4 | 
 5 | inline cudaError_t checkCuda(cudaError_t result)
 6 | {
 7 |     if (result != cudaSuccess) {
 8 |         fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
 9 |         assert(result == cudaSuccess);
10 |     }
11 |     return result;
12 | }
13 | 
/*
 * Kernel: sets a[i] = num for every i in [0, N) using a grid-stride loop.
 */
__global__
void initWith(float num, float *a, int N)
{
    // Number of threads in the whole grid.
    const int stride = blockDim.x * gridDim.x;

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < N) {
        a[i] = num;
        i += stride;
    }
}
25 | 
/*
 * Kernel: element-wise sum, result[i] = a[i] + b[i] for i in [0, N), with
 * each thread striding across the array by the total thread count.
 */
__global__
void addVectorsInto(float* result, float* a, float* b, const int N)
{
    const int firstElement = threadIdx.x + blockIdx.x * blockDim.x;
    const int totalThreads = blockDim.x * gridDim.x;

    for (int element = firstElement; element < N; element += totalThreads) {
        const float lhs = a[element];
        const float rhs = b[element];
        result[element] = lhs + rhs;
    }
}
36 | 
/*
 * Verifies that array[0..N-1] all equal `target`. Reports the first
 * mismatching element and exits with status 1, or prints a success line
 * when there is none.
 */
void checkElementsAre(float target, float* array, const int N)
{
    // Locate the first element that differs from the target, if any.
    int firstBad = -1;
    for (int i = 0; i < N; i++) {
        if (array[i] != target) {
            firstBad = i;
            break;
        }
    }

    if (firstBad >= 0) {
        printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", firstBad, array[firstBad], target);
        exit(1);
    }

    printf("SUCCESS! All values added correctly.\n");
}
47 | 
/*
 * Vector add with explicit device allocations: initializes a, b and c on the
 * GPU in three separate streams, adds a + b into c, copies c to pinned host
 * memory and verifies every element equals 7.
 */
int main()
{
    const int N = 2 << 20;
    size_t size = N * sizeof(float);

    int deviceId;
    checkCuda(cudaGetDevice(&deviceId));

    cudaDeviceProp props;
    checkCuda(cudaGetDeviceProperties(&props, deviceId));

    float *a, *b, *c, *h_c;

    checkCuda(cudaMalloc(&a, size));
    checkCuda(cudaMalloc(&b, size));
    checkCuda(cudaMalloc(&c, size));
    checkCuda(cudaMallocHost(&h_c, size));

    size_t threadsPerBlock = props.maxThreadsPerBlock;
    size_t numberOfBlocks = props.multiProcessorCount;

    cudaStream_t stream1, stream2, stream3;
    checkCuda(cudaStreamCreate(&stream1));
    checkCuda(cudaStreamCreate(&stream2));
    checkCuda(cudaStreamCreate(&stream3));

    // A kernel launch returns no status of its own, so each launch is
    // followed by an explicit query for launch errors.
    initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
    checkCuda(cudaGetLastError());
    initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
    checkCuda(cudaGetLastError());
    initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);
    checkCuda(cudaGetLastError());

    // NOTE(review): this launch uses the default stream and relies on it
    // being ordered after stream1..3 -- confirm the build does not enable
    // per-thread default streams.
    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
    checkCuda(cudaGetLastError());

    checkCuda(cudaMemcpy(h_c, c, size, cudaMemcpyDeviceToHost));

    checkElementsAre(7, h_c, N);

    checkCuda(cudaStreamDestroy(stream1));
    checkCuda(cudaStreamDestroy(stream2));
    checkCuda(cudaStreamDestroy(stream3));

    checkCuda(cudaFree(a));
    checkCuda(cudaFree(b));
    checkCuda(cudaFree(c));
    checkCuda(cudaFreeHost(h_c));
}


--------------------------------------------------------------------------------
/CUDA/NVIDIA_Online_Training/Fundamentals_of_CUDA_C_C++/16_vector-add-overlap-xfer.cu:
--------------------------------------------------------------------------------
 1 | // 16_vector-add-overlap-xfer.cu
 2 | #include <stdio.h>
 3 | #include <assert.h>
 4 | 
 5 | inline cudaError_t checkCuda(cudaError_t result)
 6 | {
 7 |     if (result != cudaSuccess) {
 8 |         fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
 9 |         assert(result == cudaSuccess);
10 |     }
11 |     return result;
12 | }
13 | 
/*
 * Kernel: fills a[0..N-1] with `num`; each thread starts at its global index
 * and advances by the total number of threads in the grid.
 */
__global__
void initWith(float num, float *a, int N)
{
    const int firstElement = threadIdx.x + blockIdx.x * blockDim.x;
    const int totalThreads = blockDim.x * gridDim.x;

    for (int element = firstElement; element < N; element += totalThreads) {
        a[element] = num;
    }
}
25 | 
/*
 * Kernel: writes a[i] + b[i] into result[i] for each i in [0, N), covering
 * the range with a grid-stride loop.
 */
__global__
void addVectorsInto(float* result, float* a, float* b, const int N)
{
    const int start = threadIdx.x + blockIdx.x * blockDim.x;
    const int stride = blockDim.x * gridDim.x;

    int i = start;
    while (i < N) {
        result[i] = a[i] + b[i];
        i += stride;
    }
}
36 | 
/*
 * Confirms that each of the first N values in `array` equals `target`.
 * Prints a FAIL line and exits with status 1 at the first difference;
 * prints a success line if there is none.
 */
void checkElementsAre(float target, float* array, const int N)
{
    const float *end = array + N;

    for (const float *current = array; current < end; ++current) {
        if (*current == target) {
            continue;
        }
        const int index = (int)(current - array);
        printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", index, *current, target);
        exit(1);
    }

    printf("SUCCESS! All values added correctly.\n");
}
47 | 
/*
 * Vector add with overlapped compute and transfer: initializes a, b and c on
 * the GPU in three streams, then processes the vectors in four segments, each
 * in its own stream that adds the segment and copies it back to pinned host
 * memory. Finally verifies every element equals 7.
 */
int main()
{
    const int N = 2 << 20;
    size_t size = N * sizeof(float);

    int deviceId;
    checkCuda(cudaGetDevice(&deviceId));

    cudaDeviceProp props;
    checkCuda(cudaGetDeviceProperties(&props, deviceId));

    float *a, *b, *c, *h_c;

    checkCuda(cudaMalloc(&a, size));
    checkCuda(cudaMalloc(&b, size));
    checkCuda(cudaMalloc(&c, size));
    checkCuda(cudaMallocHost(&h_c, size));

    size_t threadsPerBlock = props.maxThreadsPerBlock;
    size_t numberOfBlocks = props.multiProcessorCount;

    cudaStream_t stream1, stream2, stream3;
    checkCuda(cudaStreamCreate(&stream1));
    checkCuda(cudaStreamCreate(&stream2));
    checkCuda(cudaStreamCreate(&stream3));

    initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
    checkCuda(cudaGetLastError());
    initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
    checkCuda(cudaGetLastError());
    initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);
    checkCuda(cudaGetLastError());

    // The per-segment streams created below are not ordered with respect to
    // stream1..3, so wait for the initialization to finish before any
    // segment reads a/b or writes c.
    checkCuda(cudaStreamSynchronize(stream1));
    checkCuda(cudaStreamSynchronize(stream2));
    checkCuda(cudaStreamSynchronize(stream3));

    const int numberOfSegments = 4;
    const int segmentN = N / numberOfSegments;
    const size_t segmentSize = size / numberOfSegments;

    // A grid of zero blocks is an invalid launch; devices with fewer than
    // four multiprocessors still get one block per segment.
    size_t blocksPerSegment = numberOfBlocks / numberOfSegments;
    if (blocksPerSegment == 0) {
        blocksPerSegment = 1;
    }

    for (int i = 0; i < numberOfSegments; ++i) {
        const int offset = i * segmentN;

        cudaStream_t stream;
        checkCuda(cudaStreamCreate(&stream));

        // Within one stream the copy is ordered after the kernel, while
        // different segments are free to overlap with each other.
        addVectorsInto<<<blocksPerSegment, threadsPerBlock, 0, stream>>>(&c[offset], &a[offset], &b[offset], segmentN);
        checkCuda(cudaGetLastError());
        checkCuda(cudaMemcpyAsync(&h_c[offset], &c[offset], segmentSize, cudaMemcpyDeviceToHost, stream));
        checkCuda(cudaStreamDestroy(stream));
    }
    // Wait for every segment's kernel and copy before reading h_c.
    checkCuda(cudaDeviceSynchronize());

    checkElementsAre(7, h_c, N);

    checkCuda(cudaStreamDestroy(stream1));
    checkCuda(cudaStreamDestroy(stream2));
    checkCuda(cudaStreamDestroy(stream3));

    checkCuda(cudaFree(a));
    checkCuda(cudaFree(b));
    checkCuda(cudaFree(c));
    checkCuda(cudaFreeHost(h_c));
}


--------------------------------------------------------------------------------
/CUDA/StreamsAndEvents/asyncAPI.cu:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        asyncAPI.cu
 3 |  * Description: This is an example of using CUDA events to control asynchronous
 4 |  *              work launched on the GPU. In this example, asynchronous copies
 5 |  *              and an asynchronous kernel are used. A CUDA event is used to
 6 |  *              determine when that work has completed.
 7 |  *              
 8 |  * Compile:     nvcc -o asyncAPI asyncAPI.cu -I..
 9 |  * Run:         ./asyncAPI
10 |  *****************************************************************************/
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | #include <cuda_runtime.h>
14 | #include "common/common.h"
15 | 
/*
 * Kernel: adds `value` to the element of g_data at the thread's global index.
 * There is no bounds check, so the launch must not create more threads than
 * g_data has elements.
 */
__global__
void kernel(float* g_data, float value)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    g_data[idx] += value;
}
22 | 
/*
 * Returns true when data[0..N-1] all equal x. On the first element that
 * differs it prints the index, the value found and the reference value, and
 * returns false.
 */
bool checkResult(float* data, const int N, const float x)
{
    int i = 0;
    while (i < N) {
        if (data[i] != x) {
            printf("Error! data[%d] = %f, ref = %f\n", i, data[i], x);
            return false;
        }
        ++i;
    }

    return true;
}
34 | 
/*
 * Issues an asynchronous host-to-device copy, a kernel and a device-to-host
 * copy, records an event behind them, and counts host loop iterations while
 * polling that event. The exit status reflects whether the data copied back
 * matches the expected value.
 */
int main(int argc, char** argv)
{
    int dev = 0;
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> Using device %d: %s\n", dev, deviceProp.name);
    CUDA_CHECK(cudaSetDevice(dev));

    int num = 1 << 24;
    int nBytes = num * sizeof(float);
    float value = 10.0f;

    // allocate host memory
    float *h_a;
    CUDA_CHECK(cudaMallocHost((void**)&h_a, nBytes));
    memset(h_a, 0, nBytes);

    // allocate device memory
    float *d_a;
    CUDA_CHECK(cudaMalloc((void**)&d_a, nBytes));
    CUDA_CHECK(cudaMemset(d_a, 255, nBytes));

    // set kernel launch configuration
    dim3 block = dim3(512);
    dim3 grid = dim3((num + block.x - 1) / block.x);

    // create cuda event handles
    cudaEvent_t stop;
    CUDA_CHECK(cudaEventCreate(&stop));

    // asynchronously issue work to the GPU (all to stream 0)
    CUDA_CHECK(cudaMemcpyAsync(d_a, h_a, nBytes, cudaMemcpyHostToDevice));
    kernel<<<grid, block>>>(d_a, value);
    // a kernel launch returns no status of its own; query it explicitly
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(h_a, d_a, nBytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop));

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;
    cudaError_t queryStatus;
    while ((queryStatus = cudaEventQuery(stop)) == cudaErrorNotReady)
        counter++;
    // the loop also ends when the query reports a failure; surface it
    CUDA_CHECK(queryStatus);

    // print the number of polling iterations the CPU completed
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    // check the output for correctness
    bool results = checkResult(h_a, num, value);

    // release resources
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaDeviceReset());

    // report a failed verification through the exit status
    return results ? EXIT_SUCCESS : EXIT_FAILURE;
}


--------------------------------------------------------------------------------
/CUDA/StreamsAndEvents/simpleCallback.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        simpleCallback.cu
  3 |  * Description: This is an example of using CUDA callbacks to trigger work on
  4 |  *              the host after the completion of asynchronous work on the device.
  5 |  *              In this example, NSTREAM CUDA streams are created and 4 kernels
  6 |  *              are launched asynchronously in each. Then, a callback is added
  7 |  *              at the completion of those asynchronous kernels that
  8 |  *              prints diagnostic information.
  9 |  *              
 10 |  * Compile:     nvcc -o simpleCallback simpleCallback.cu -I..
 11 |  * Run:         ./simpleCallback
 12 |  *****************************************************************************/
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | #include <cuda_runtime.h>
 16 | #include "common/common.h"
 17 | 
 18 | #define N 100000
 19 | #define NSTREAM 4
 20 | 
 21 | void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void* data)
 22 | {   // Reads `data` as a pointer to an int and prints that value; stream/status are unused.
 23 |     printf("callback from stream %d\n", *static_cast<const int*>(data));
 24 | }
 25 | 
 26 | __global__ void kernel_1()
 27 | {   // Dummy workload: N accumulations of tan(0.1)^2; the total is never stored.
 28 |     double acc = 0.0;
 29 |     for (int step = 0; step < N; ++step) {
 30 |         acc += tan(0.1) * tan(0.1);
 31 |     }
 32 | }
 33 | 
 34 | __global__ void kernel_2()
 35 | {   // Dummy workload: N accumulations of tan(0.1)^2; the total is never stored.
 36 |     double acc = 0.0;
 37 |     for (int step = 0; step < N; ++step) {
 38 |         acc += tan(0.1) * tan(0.1);
 39 |     }
 40 | }
 41 | 
 42 | __global__ void kernel_3()
 43 | {   // Dummy workload: N accumulations of tan(0.1)^2; the total is never stored.
 44 |     double acc = 0.0;
 45 |     for (int step = 0; step < N; ++step) {
 46 |         acc += tan(0.1) * tan(0.1);
 47 |     }
 48 | }
 49 | 
 50 | __global__ void kernel_4()
 51 | {   // Dummy workload: N accumulations of tan(0.1)^2; the total is never stored.
 52 |     double acc = 0.0;
 53 |     for (int step = 0; step < N; ++step) {
 54 |         acc += tan(0.1) * tan(0.1);
 55 |     }
 56 | }
 57 | 
 58 | int main(int argc, char** argv)
 59 | {    
 60 |     int dev = 0;
 61 |     cudaDeviceProp deviceProp;
 62 |     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 63 |     printf("> Using device %d: %s\n", dev, deviceProp.name);
 64 |     CUDA_CHECK(cudaSetDevice(dev));
 65 | 
 66 |     // check if device support hyper-Q
 67 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
 68 |         if (deviceProp.concurrentKernels == 0) {
 69 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n");
 70 |             printf("> CUDA kernel runs will be serialized\n");
 71 |         }
 72 |         else {
 73 |             printf("> GPU does not support HyperQ\n");
 74 |             printf("> CUDA kernel runs will have limited concurrency\n");
 75 |         }
 76 |     }
 77 | 
 78 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n", deviceProp.major,
 79 |             deviceProp.minor, deviceProp.multiProcessorCount);
 80 |     
 81 |     // set up max connection (string literals are const, so bind them to const char*)
 82 |     const char* iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 83 |     _putenv_s(iname, "8");
 84 |     const char* ivalue = getenv(iname);
 85 |     printf("> %s = %s\n", iname, ivalue);
 86 |     printf("> with streams = %d\n", NSTREAM);
 87 | 
 88 |     // create the stream handles; NSTREAM is a compile-time constant, so a
 89 |     // fixed-size array replaces the unchecked heap allocation
 90 |     cudaStream_t streams[NSTREAM];
 91 |     for (int i = 0; i < NSTREAM; i++) {
 92 |         CUDA_CHECK(cudaStreamCreate(&streams[i]));
 93 |     }
 94 | 
 95 |     dim3 block(1);
 96 |     dim3 grid(1);
 97 |     cudaEvent_t start, stop;
 98 |     CUDA_CHECK(cudaEventCreate(&start));
 99 |     CUDA_CHECK(cudaEventCreate(&stop));
100 | 
101 |     // ids passed to the callbacks by address
102 |     int stream_ids[NSTREAM];
103 | 
104 |     CUDA_CHECK(cudaEventRecord(start, 0));
105 |     for (int i = 0; i < NSTREAM; i++) {
106 |         stream_ids[i] = i;
107 |         kernel_1<<<grid, block, 0, streams[i]>>>();
108 |         kernel_2<<<grid, block, 0, streams[i]>>>();
109 |         kernel_3<<<grid, block, 0, streams[i]>>>();
110 |         kernel_4<<<grid, block, 0, streams[i]>>>();
111 |         CUDA_CHECK(cudaGetLastError());  // a failed launch is only reported here
112 |         CUDA_CHECK(cudaStreamAddCallback(streams[i], my_callback, (void*)(stream_ids + i), 0));
113 |     }
114 |     CUDA_CHECK(cudaEventRecord(stop, 0));
115 |     CUDA_CHECK(cudaEventSynchronize(stop));
116 | 
117 |     float elapsed_time;
118 |     CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
119 |     printf("Measured time for parallel execution = %.3fs\n", elapsed_time / 1000.f);
120 | 
121 |     // release all streams
122 |     for (int i = 0; i < NSTREAM; i++) {
123 |         CUDA_CHECK(cudaStreamDestroy(streams[i]));
124 |     }
125 | 
126 |     // destroy events (previously leaked)
127 |     CUDA_CHECK(cudaEventDestroy(start));
128 |     CUDA_CHECK(cudaEventDestroy(stop));
129 | 
130 |     CUDA_CHECK(cudaDeviceReset());
131 |     return 0;
132 | }


--------------------------------------------------------------------------------
/CUDA/StreamsAndEvents/simpleHyperQBreadth.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        simpleHyperQBreadth.cu
  3 |  * Description: This example demonstrates that submitting work to CUDA
  4 |  *              streams in breadth-first order prevents false dependencies from
  5 |  *              reducing the parallelism of an application. kernel_1, kernel_2,
  6 |  *              kernel_3, and kernel_4 simply implement identical, dummy computation.
  7 |  *              Separate kernels are used to make the scheduling of these kernels
  8 |  *              simpler to visualize in the Visual Profiler.
  9 |  *              
 10 |  * Compile:     nvcc -o simpleHyperQBreadth simpleHyperQBreadth.cu -I..
 11 |  * Run:         ./simpleHyperQBreadth
 12 |  *****************************************************************************/
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | #include <cuda_runtime.h>
 16 | #include "common/common.h"
 17 | 
 18 | #define N 1000
 19 | #define NSTREAM 4
 20 | 
 21 | __global__
 22 | void kernel_1()
 23 | {
 24 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 25 |     double acc = 0.0;
 26 |     for (int step = 0; step < N; ++step) {
 27 |         acc += tan(0.1) * tan(0.1);
 28 |         printf("%f\n", acc);
 29 |     }
 30 | }
 31 | 
 32 | __global__
 33 | void kernel_2()
 34 | {
 35 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 36 |     double acc = 0.0;
 37 |     for (int step = 0; step < N; ++step) {
 38 |         acc += tan(0.1) * tan(0.1);
 39 |         printf("%f\n", acc);
 40 |     }
 41 | }
 42 | 
 43 | __global__
 44 | void kernel_3()
 45 | {
 46 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 47 |     double acc = 0.0;
 48 |     for (int step = 0; step < N; ++step) {
 49 |         acc += tan(0.1) * tan(0.1);
 50 |         printf("%f\n", acc);
 51 |     }
 52 | }
 53 | 
 54 | __global__
 55 | void kernel_4()
 56 | {
 57 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 58 |     double acc = 0.0;
 59 |     for (int step = 0; step < N; ++step) {
 60 |         acc += tan(0.1) * tan(0.1);
 61 |         printf("%f\n", acc);
 62 |     }
 63 | }
 64 | 
 65 | int main(int argc, char** argv)
 66 | {
 67 |     int n_streams = NSTREAM;
 68 |     int isize = 1;
 69 |     int iblock = 1;
 70 |     int bigcase = 0;
 71 | 
 72 |     // get arguments from command line: [n_streams] [bigcase]
 73 |     if (argc > 1)
 74 |         n_streams = atoi(argv[1]);
 75 |     if (argc > 2)
 76 |         bigcase = atoi(argv[2]);
 77 | 
 78 |     // reject a non-positive stream count (atoi yields 0 for non-numeric text)
 79 |     // before it is used as an allocation size and loop bound
 80 |     if (n_streams < 1) {
 81 |         fprintf(stderr, "Error: number of streams must be a positive integer\n");
 82 |         return 1;
 83 |     }
 84 | 
 85 |     float elapsed_time;
 86 | 
 87 |     // set up max connection (string literals are const, so bind them to const char*)
 88 |     const char* iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 89 |     _putenv_s(iname, "32");
 90 |     const char* ivalue = getenv(iname);
 91 |     printf("%s = %s\n", iname, ivalue);
 92 | 
 93 |     int dev = 0;
 94 |     cudaDeviceProp deviceProp;
 95 |     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 96 |     printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name, n_streams);
 97 |     CUDA_CHECK(cudaSetDevice(dev));
 98 | 
 99 |     // check if device support hyper-Q
100 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
101 |         if (deviceProp.concurrentKernels == 0) {
102 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n");
103 |             printf("> CUDA kernel runs will be serialized\n");
104 |         }
105 |         else {
106 |             printf("> GPU does not support HyperQ\n");
107 |             printf("> CUDA kernel runs will have limited concurrency\n");
108 |         }
109 |     }
110 | 
111 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n", deviceProp.major,
112 |             deviceProp.minor, deviceProp.multiProcessorCount);
113 |     
114 |     // Allocate and initialize an array of stream handles
115 |     cudaStream_t *streams = (cudaStream_t*)malloc(n_streams * sizeof(cudaStream_t));
116 |     if (streams == NULL) {
117 |         fprintf(stderr, "Error: failed to allocate %d stream handles\n", n_streams);
118 |         return 1;
119 |     }
120 | 
121 |     for (int i = 0; i < n_streams; i++) {
122 |         CUDA_CHECK(cudaStreamCreate(&(streams[i])));
123 |     }
124 |     
125 |     // run kernel with more threads
126 |     if (bigcase == 1) {
127 |         iblock = 512;
128 |         isize = 1 << 12;
129 |     }
130 | 
131 |     // setup execution configuration
132 |     dim3 block(iblock);
133 |     dim3 grid(isize / iblock);
134 |     printf("> grid %d block %d\n", grid.x, block.x);
135 | 
136 |     // create events
137 |     cudaEvent_t start, stop;
138 |     CUDA_CHECK(cudaEventCreate(&start));
139 |     CUDA_CHECK(cudaEventCreate(&stop));
140 | 
141 |     // record start event
142 |     CUDA_CHECK(cudaEventRecord(start, 0));
143 | 
144 |     // dispatch job with breadth first ordering
145 |     for (int i = 0; i < n_streams; i++) 
146 |         kernel_1<<<grid, block, 0, streams[i]>>>();
147 |     for (int i = 0; i < n_streams; i++) 
148 |         kernel_2<<<grid, block, 0, streams[i]>>>();
149 |     for (int i = 0; i < n_streams; i++) 
150 |         kernel_3<<<grid, block, 0, streams[i]>>>();
151 |     for (int i = 0; i < n_streams; i++) 
152 |         kernel_4<<<grid, block, 0, streams[i]>>>();
153 |     CUDA_CHECK(cudaGetLastError());  // a failed launch above is only reported here
154 |     
155 |     // record stop event
156 |     CUDA_CHECK(cudaEventRecord(stop, 0));
157 |     CUDA_CHECK(cudaEventSynchronize(stop));
158 | 
159 |     // calculate elapsed time
160 |     CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
161 |     printf("Measured time for parallel execution = %fs\n", elapsed_time / 1000.f);
162 | 
163 |     // release all streams
164 |     for (int i = 0; i < n_streams; i++) {
165 |         CUDA_CHECK(cudaStreamDestroy(streams[i]));
166 |     }
167 |     free(streams);
168 |     
169 |     // destroy events
170 |     CUDA_CHECK(cudaEventDestroy(start));
171 |     CUDA_CHECK(cudaEventDestroy(stop));
172 | 
173 |     // reset device
174 |     CUDA_CHECK(cudaDeviceReset());
175 | 
176 |     return 0;
177 | }


--------------------------------------------------------------------------------
/CUDA/StreamsAndEvents/simpleHyperQDepth.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        simpleHyperQDepth.cu
  3 |  * Description: This example demonstrates submitting work to a CUDA
  4 |  *              Stream in depth-first order. Work submission in depth-first order
  5 |  *              may introduce false-dependencies between unrelated tasks in
  6 |  *              different CUDA streams, limiting the parallelism of a CUDA application.
  7 |  *              kernel_1, kernel_2, kernel_3, and kernel_4 simply implement
  8 |  *              identical, dummy computation. Separate kernels are used to make
  9 |  *              the scheduling of these kernels simpler to visualize in the Visual
 10 |  *              Profiler.
 11 |  *              
 12 |  * Compile:     nvcc -o simpleHyperQDepth simpleHyperQDepth.cu -I..
 13 |  * Run:         ./simpleHyperQDepth
 14 |  *****************************************************************************/
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <cuda_runtime.h>
 18 | #include "common/common.h"
 19 | 
 20 | #define N 1000
 21 | #define NSTREAM 4
 22 | 
 23 | __global__
 24 | void kernel_1()
 25 | {
 26 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 27 |     double acc = 0.0;
 28 |     for (int step = 0; step < N; ++step) {
 29 |         acc += tan(0.1) * tan(0.1);
 30 |         printf("%f\n", acc);
 31 |     }
 32 | }
 33 | 
 34 | __global__
 35 | void kernel_2()
 36 | {
 37 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 38 |     double acc = 0.0;
 39 |     for (int step = 0; step < N; ++step) {
 40 |         acc += tan(0.1) * tan(0.1);
 41 |         printf("%f\n", acc);
 42 |     }
 43 | }
 44 | 
 45 | __global__
 46 | void kernel_3()
 47 | {
 48 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 49 |     double acc = 0.0;
 50 |     for (int step = 0; step < N; ++step) {
 51 |         acc += tan(0.1) * tan(0.1);
 52 |         printf("%f\n", acc);
 53 |     }
 54 | }
 55 | 
 56 | __global__
 57 | void kernel_4()
 58 | {
 59 |     // Dummy workload: print the running sum of tan(0.1)^2 after each of N steps.
 60 |     double acc = 0.0;
 61 |     for (int step = 0; step < N; ++step) {
 62 |         acc += tan(0.1) * tan(0.1);
 63 |         printf("%f\n", acc);
 64 |     }
 65 | }
 66 | 
 67 | int main(int argc, char** argv)
 68 | {
 69 |     int n_streams = NSTREAM;
 70 |     int isize = 1;
 71 |     int iblock = 1;
 72 |     int bigcase = 0;
 73 | 
 74 |     // get arguments from command line: [n_streams] [bigcase]
 75 |     if (argc > 1)
 76 |         n_streams = atoi(argv[1]);
 77 |     if (argc > 2)
 78 |         bigcase = atoi(argv[2]);
 79 | 
 80 |     // reject a non-positive stream count (atoi yields 0 for non-numeric text)
 81 |     // before it is used as an allocation size and loop bound
 82 |     if (n_streams < 1) {
 83 |         fprintf(stderr, "Error: number of streams must be a positive integer\n");
 84 |         return 1;
 85 |     }
 86 | 
 87 |     float elapsed_time;
 88 | 
 89 |     // set up max connection (string literals are const, so bind them to const char*)
 90 |     const char* iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 91 |     _putenv_s(iname, "32");
 92 |     const char* ivalue = getenv(iname);
 93 |     printf("%s = %s\n", iname, ivalue);
 94 | 
 95 |     int dev = 0;
 96 |     cudaDeviceProp deviceProp;
 97 |     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 98 |     printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name, n_streams);
 99 |     CUDA_CHECK(cudaSetDevice(dev));
100 | 
101 |     // check if device support hyper-Q
102 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
103 |         if (deviceProp.concurrentKernels == 0) {
104 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n");
105 |             printf("> CUDA kernel runs will be serialized\n");
106 |         }
107 |         else {
108 |             printf("> GPU does not support HyperQ\n");
109 |             printf("> CUDA kernel runs will have limited concurrency\n");
110 |         }
111 |     }
112 | 
113 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n", deviceProp.major,
114 |             deviceProp.minor, deviceProp.multiProcessorCount);
115 |     
116 |     // Allocate and initialize an array of stream handles
117 |     cudaStream_t *streams = (cudaStream_t*)malloc(n_streams * sizeof(cudaStream_t));
118 |     if (streams == NULL) {
119 |         fprintf(stderr, "Error: failed to allocate %d stream handles\n", n_streams);
120 |         return 1;
121 |     }
122 | 
123 |     for (int i = 0; i < n_streams; i++) {
124 |         CUDA_CHECK(cudaStreamCreate(&(streams[i])));
125 |     }
126 |     
127 |     // run kernel with more threads
128 |     if (bigcase == 1) {
129 |         iblock = 512;
130 |         isize = 1 << 12;
131 |     }
132 | 
133 |     // setup execution configuration
134 |     dim3 block(iblock);
135 |     dim3 grid(isize / iblock);
136 |     printf("> grid %d block %d\n", grid.x, block.x);
137 | 
138 |     // create events
139 |     cudaEvent_t start, stop;
140 |     CUDA_CHECK(cudaEventCreate(&start));
141 |     CUDA_CHECK(cudaEventCreate(&stop));
142 | 
143 |     // record start event
144 |     CUDA_CHECK(cudaEventRecord(start, 0));
145 | 
146 |     // dispatch job with depth first ordering
147 |     for (int i = 0;i < n_streams; i++) {
148 |         kernel_1<<<grid, block, 0, streams[i]>>>();
149 |         kernel_2<<<grid, block, 0, streams[i]>>>();
150 |         kernel_3<<<grid, block, 0, streams[i]>>>();
151 |         kernel_4<<<grid, block, 0, streams[i]>>>();
152 |     }
153 |     CUDA_CHECK(cudaGetLastError());  // a failed launch above is only reported here
154 |     
155 |     // record stop event
156 |     CUDA_CHECK(cudaEventRecord(stop, 0));
157 |     CUDA_CHECK(cudaEventSynchronize(stop));
158 | 
159 |     // calculate elapsed time
160 |     CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
161 |     printf("Measured time for parallel execution = %fs\n", elapsed_time / 1000.f);
162 | 
163 |     // release all streams
164 |     for (int i = 0; i < n_streams; i++) {
165 |         CUDA_CHECK(cudaStreamDestroy(streams[i]));
166 |     }
167 |     free(streams);
168 |     
169 |     // destroy events
170 |     CUDA_CHECK(cudaEventDestroy(start));
171 |     CUDA_CHECK(cudaEventDestroy(stop));
172 | 
173 |     // reset device
174 |     CUDA_CHECK(cudaDeviceReset());
175 | 
176 |     return 0;
177 | }


--------------------------------------------------------------------------------
/CUDA/UnifiedMemory/matrixAddWithUnifiedMemory.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        matrixAddWithUnifiedMemory.cu
  3 |  * Description: This example demonstrates the use of CUDA managed memory
  4 |  *              to implement matrix addition. In this example, arbitrary pointers
  5 |  *              can be dereferenced on the host and device.
  6 |  *              CUDA will automatically manage the transfer of data to and from 
  7 |  *              the GPU as needed by the application.
  8 |  *              
  9 |  *              There is no need for the programmer to use cudaMemcpy, 
 10 |  *              cudaHostGetDevicePointer, or any other CUDA API involved with
 11 |  *              explicitly transferring data.
 12 |  *              
 13 |  *              
 14 |  * Compile:     nvcc -O3 -o managed matrixAddWithUnifiedMemory.cu -I..
 15 |  * Run:         ./managed
 16 |  *                  [n]: power to set size of input matrix (default: 12)
 17 |  *****************************************************************************/
 18 | #include <stdio.h>
 19 | #include <cuda_runtime.h>
 20 | #include "common/common.h"
 21 | 
 22 | void initialData(float* in, const int size)
 23 | {   // Fill in[0..size) with values k / 10, where k is the low byte of rand().
 24 |     for (int idx = 0; idx < size; ++idx)
 25 |         in[idx] = static_cast<float>(rand() & 0xFF) / 10.f;
 26 | }
 27 | 
 28 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
 29 | {
 30 |     // Element-wise C = A + B over ny rows of nx contiguous floats; the three
 31 |     // cursors always point at the start of the row being summed.
 32 |     const float* rowA = A;
 33 |     const float* rowB = B;
 34 |     float* rowC = C;
 35 | 
 36 |     for (int row = 0; row < ny; ++row) {
 37 |         for (int col = 0; col < nx; ++col)
 38 |             rowC[col] = rowA[col] + rowB[col];
 39 |         rowA += nx;
 40 |         rowB += nx;
 41 |         rowC += nx;
 42 |     }
 43 | }
 44 | 
 45 | void checkResult(float* hostRef, float* gpuRef, const int size)
 46 | {   // Prints the first index where the two arrays differ by more than epsilon.
 47 |     const double epsilon = 1.0e-8;
 48 |     for (int i = 0; i < size; i++) {
 49 |         // fabsf, not abs: the integer abs() would truncate sub-1.0 differences to 0
 50 |         if (fabsf(hostRef[i] - gpuRef[i]) > epsilon) {
 51 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef[i], gpuRef[i]);
 52 |             break;
 53 |         }
 54 |     }
 55 | }
 56 | 
 57 | __global__
 58 | void sumMatrixOnGPU(float* A, float* B, float* C, const int nx, const int ny)
 59 | {
 60 |     const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
 61 |     const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
 62 |     if (col >= nx || row >= ny)     // this thread falls outside the nx x ny matrix
 63 |         return;
 64 |     const unsigned int offset = row * nx + col;    // row-major flat index
 65 |     C[offset] = A[offset] + B[offset];
 66 | }
 67 | 
 68 | int main(int argc, char** argv)
 69 | {
 70 |     // setup device
 71 |     int dev = 0;
 72 |     cudaDeviceProp deviceProp;
 73 |     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 74 |     printf("Starting sumMatrix at device %d: %s\n", dev, deviceProp.name);
 75 |     CUDA_CHECK(cudaSetDevice(dev));
 76 | 
 77 |     // setup size of matrix
 78 |     int nx, ny;
 79 |     int power = 12;
 80 |     if (argc > 1)
 81 |         power = atoi(argv[1]);
 82 | 
 83 |     // nx * ny is held in an int, so 2^power * 2^power must stay below 2^31
 84 |     if (power < 0 || power > 15) {
 85 |         fprintf(stderr, "Error: power must be between 0 and 15\n");
 86 |         return 1;
 87 |     }
 88 |     nx = ny = 1 << power;
 89 | 
 90 |     int nxy = nx * ny;
 91 |     size_t nBytes = nxy * sizeof(float);
 92 |     printf("Matrix size: nx %d ny %d\n", nx, ny);
 93 | 
 94 |     // allocate managed memory; the same pointers are used by host code and the kernel
 95 |     float *A, *B, *hostRef, *gpuRef;
 96 |     CUDA_CHECK(cudaMallocManaged((void**)&A, nBytes));
 97 |     CUDA_CHECK(cudaMallocManaged((void**)&B, nBytes));
 98 |     CUDA_CHECK(cudaMallocManaged((void**)&hostRef, nBytes));
 99 |     CUDA_CHECK(cudaMallocManaged((void**)&gpuRef, nBytes));
100 | 
101 |     double start, finish;
102 |     // initialize data at host side
103 |     GET_TIME(start);
104 |     initialData(A, nxy);
105 |     initialData(B, nxy);
106 |     GET_TIME(finish);
107 |     printf("initialization: \t %f sec\n", finish - start);
108 | 
109 |     memset(hostRef, 0, nBytes);
110 |     memset(gpuRef, 0, nBytes);
111 | 
112 |     // add matrix at host side for result check
113 |     GET_TIME(start);
114 |     sumMatrixOnHost(A, B, hostRef, nx, ny);
115 |     GET_TIME(finish);
116 |     printf("sumMatrix on host:\t %f sec\n", finish - start);
117 | 
118 |     // invoke kernel at host side
119 |     int dimX = 32;
120 |     int dimY = 32;
121 |     dim3 blocks(dimX, dimY);
122 |     dim3 grids((nx + blocks.x - 1) / blocks.x, (ny + blocks.y - 1) / blocks.y);
123 |     
124 |     // warm-up kernel
125 |     sumMatrixOnGPU<<<grids, blocks>>>(A, B, gpuRef, nx, ny);
126 |     CUDA_CHECK(cudaGetLastError());  // a failed launch is only reported here
127 |     CUDA_CHECK(cudaDeviceSynchronize());
128 | 
129 |     GET_TIME(start);
130 |     sumMatrixOnGPU<<<grids, blocks>>>(A, B, gpuRef, nx, ny);
131 |     CUDA_CHECK(cudaGetLastError());
132 |     CUDA_CHECK(cudaDeviceSynchronize());
133 |     GET_TIME(finish);
134 |     printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>>\n", finish-start, grids.x, grids.y, blocks.x, blocks.y);
135 | 
136 |     // check device results
137 |     checkResult(hostRef, gpuRef, nxy);
138 | 
139 |     // free managed memory
140 |     CUDA_CHECK(cudaFree(A));
141 |     CUDA_CHECK(cudaFree(B));
142 |     CUDA_CHECK(cudaFree(hostRef));
143 |     CUDA_CHECK(cudaFree(gpuRef));
144 | 
145 |     CUDA_CHECK(cudaDeviceReset());
146 |     return 0;
147 | }


--------------------------------------------------------------------------------
/CUDA/UnifiedMemory/matrixAddWithoutUnifiedMemory.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        matrixAddWithoutUnifiedMemory.cu
  3 |  * Description: This example demonstrates the use of explicit CUDA memory
  4 |  *              transfers to implement matrix addition. This code contrasts with
  5 |  *              matrixAddWithUnifiedMemory.cu, where CUDA managed memory is used to
  6 |  *              remove all explicit memory transfers and abstract away the concept
  7 |  *              of physically separate address spaces.
  8 |  *              
  9 |  *              
 10 |  * Compile:     nvcc -O3 -o manual matrixAddWithoutUnifiedMemory.cu -I..
 11 |  * Run:         ./manual
 12 |  *                  [n]: power to set size of input matrix (default: 12)
 13 |  *****************************************************************************/
 14 | #include <stdio.h>
 15 | #include <cuda_runtime.h>
 16 | #include "common/common.h"
 17 | 
 18 | void initialData(float* in, const int size)
 19 | {   // Fill in[0..size) with values k / 10, where k is the low byte of rand().
 20 |     for (int idx = 0; idx < size; ++idx)
 21 |         in[idx] = static_cast<float>(rand() & 0xFF) / 10.f;
 22 | }
 23 | 
 24 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
 25 | {
 26 |     // Element-wise C = A + B over ny rows of nx contiguous floats; the three
 27 |     // cursors always point at the start of the row being summed.
 28 |     const float* rowA = A;
 29 |     const float* rowB = B;
 30 |     float* rowC = C;
 31 | 
 32 |     for (int row = 0; row < ny; ++row) {
 33 |         for (int col = 0; col < nx; ++col)
 34 |             rowC[col] = rowA[col] + rowB[col];
 35 |         rowA += nx;
 36 |         rowB += nx;
 37 |         rowC += nx;
 38 |     }
 39 | }
 40 | 
 41 | void checkResult(float* hostRef, float* gpuRef, const int size)
 42 | {   // Prints the first index where the two arrays differ by more than epsilon.
 43 |     const double epsilon = 1.0e-8;
 44 |     for (int i = 0; i < size; i++) {
 45 |         // fabsf, not abs: the integer abs() would truncate sub-1.0 differences to 0
 46 |         if (fabsf(hostRef[i] - gpuRef[i]) > epsilon) {
 47 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef[i], gpuRef[i]);
 48 |             break;
 49 |         }
 50 |     }
 51 | }
 52 | 
 53 | __global__
 54 | void sumMatrixOnGPU(float* A, float* B, float* C, const int nx, const int ny)
 55 | {
 56 |     const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
 57 |     const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
 58 |     if (col >= nx || row >= ny)     // this thread falls outside the nx x ny matrix
 59 |         return;
 60 |     const unsigned int offset = row * nx + col;    // row-major flat index
 61 |     C[offset] = A[offset] + B[offset];
 62 | }
 63 | 
 64 | int main(int argc, char** argv)
 65 | {
 66 |     // setup device
 67 |     int dev = 0;
 68 |     cudaDeviceProp deviceProp;
 69 |     CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 70 |     printf("Starting sumMatrix at device %d: %s\n", dev, deviceProp.name);
 71 |     CUDA_CHECK(cudaSetDevice(dev));
 72 | 
 73 |     // setup size of matrix
 74 |     int nx, ny;
 75 |     int power = 12;
 76 |     if (argc > 1)
 77 |         power = atoi(argv[1]);
 78 | 
 79 |     // nx * ny is held in an int, so 2^power * 2^power must stay below 2^31
 80 |     if (power < 0 || power > 15) {
 81 |         fprintf(stderr, "Error: power must be between 0 and 15\n");
 82 |         return 1;
 83 |     }
 84 |     nx = ny = 1 << power;
 85 | 
 86 |     int nxy = nx * ny;
 87 |     size_t nBytes = nxy * sizeof(float);
 88 |     printf("Matrix size: nx %d ny %d\n", nx, ny);
 89 | 
 90 |     // malloc host memory
 91 |     float *h_A, *h_B, *hostRef, *gpuRef;
 92 |     h_A = (float*)malloc(nBytes);
 93 |     h_B = (float*)malloc(nBytes);
 94 |     hostRef = (float*)malloc(nBytes);
 95 |     gpuRef = (float*)malloc(nBytes);
 96 |     if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
 97 |         fprintf(stderr, "Error: host memory allocation failed\n");
 98 |         free(h_A);      // free(NULL) is a no-op, so partial failures are handled too
 99 |         free(h_B);
100 |         free(hostRef);
101 |         free(gpuRef);
102 |         return 1;
103 |     }
104 | 
105 |     double start, finish;
106 |     // initialize data at host side
107 |     GET_TIME(start);
108 |     initialData(h_A, nxy);
109 |     initialData(h_B, nxy);
110 |     GET_TIME(finish);
111 |     printf("initialization: \t %f sec\n", finish - start);
112 | 
113 |     memset(hostRef, 0, nBytes);
114 |     memset(gpuRef, 0, nBytes);
115 | 
116 |     // add matrix at host side for result check
117 |     GET_TIME(start);
118 |     sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
119 |     GET_TIME(finish);
120 |     printf("sumMatrix on host:\t %f sec\n", finish - start);
121 | 
122 |     // malloc device global memory
123 |     float *d_A, *d_B, *d_C;
124 |     CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes));
125 |     CUDA_CHECK(cudaMalloc((void**)&d_B, nBytes));
126 |     CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes));
127 | 
128 |     // invoke kernel at host side
129 |     int dimX = 32;
130 |     int dimY = 32;
131 |     dim3 blocks(dimX, dimY);
132 |     dim3 grids((nx + blocks.x - 1) / blocks.x, (ny + blocks.y - 1) / blocks.y);
133 |     
134 |     // warm-up kernel (cudaMemset takes an int byte value, not a float)
135 |     CUDA_CHECK(cudaMemset(d_A, 0, nBytes));
136 |     CUDA_CHECK(cudaMemset(d_B, 0, nBytes));
137 |     sumMatrixOnGPU<<<grids, blocks>>>(d_A, d_B, d_C, nx, ny);
138 |     CUDA_CHECK(cudaGetLastError());  // a failed launch is only reported here
139 |     CUDA_CHECK(cudaDeviceSynchronize());
140 | 
141 |     // transfer data from host to device
142 |     CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
143 |     CUDA_CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
144 | 
145 |     GET_TIME(start);
146 |     sumMatrixOnGPU<<<grids, blocks>>>(d_A, d_B, d_C, nx, ny);
147 |     CUDA_CHECK(cudaGetLastError());
148 |     CUDA_CHECK(cudaDeviceSynchronize());
149 |     GET_TIME(finish);
150 |     printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>>\n", finish-start, grids.x, grids.y, blocks.x, blocks.y);
151 | 
152 |     CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
153 | 
154 |     // check device results
155 |     checkResult(hostRef, gpuRef, nxy);
156 | 
157 |     // free device global memory
158 |     CUDA_CHECK(cudaFree(d_A));
159 |     CUDA_CHECK(cudaFree(d_B));
160 |     CUDA_CHECK(cudaFree(d_C));
161 | 
162 |     // free host memory
163 |     free(h_A);
164 |     free(h_B);
165 |     free(hostRef);
166 |     free(gpuRef);
167 | 
168 |     CUDA_CHECK(cudaDeviceReset());
169 |     return 0;
170 | }


--------------------------------------------------------------------------------
/CUDA/bezierCurves/bezierCurves.cuh:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <cuda.h>
 3 | 
 4 | #define MAX_TESS_POINTS 32
 5 | #define N_LINES 256
 6 | #define BLOCK_DIM 32
 7 | 
 8 | // A structure containing all parameters needed to tessellate a Bezier line
 9 | struct BezierLine
10 | {
11 |     float2 CP[3];                       // Control points for the line (three per line)
12 |     float2 vertexPos[MAX_TESS_POINTS];  // Vertex position array to tessellate into
13 |     int nVertices;                      // Number of tessellated vertices
14 | };
15 | 
16 | __forceinline__ __device__ float2 operator+(float2 a, float2 b)
17 | {
18 |     // Component-wise sum; a is a by-value copy, so it is reused as the result.
19 |     a.x += b.x;
20 |     a.y += b.y;
21 |     return a;
22 | }
23 | 
24 | __forceinline__ __device__ float2 operator-(float2 a, float2 b)
25 | {
26 |     // Component-wise difference; a is a by-value copy, so it is reused as the result.
27 |     a.x -= b.x;
28 |     a.y -= b.y;
29 |     return a;
30 | }
31 | 
32 | __forceinline__ __device__ float2 operator*(float a, float2 b)
33 | {
34 |     // Scale both components of b by a; b is a by-value copy reused as the result.
35 |     b.x = a * b.x;
36 |     b.y = a * b.y;
37 |     return b;
38 | }
39 | 
40 | __forceinline__ __device__ float length(float2 a)
41 | {   // Euclidean length of the 2D vector a.
42 |     return sqrtf(a.x * a.x + a.y * a.y);
43 | }
44 | 
45 | __forceinline__ __device__ float computeCurvature(BezierLine *bLines)
46 | {
47 |     // For the line indexed by blockIdx.x: distance of the middle control point
48 |     // from the midpoint of the end points, divided by the end-to-end distance.
49 |     const BezierLine &line = bLines[blockIdx.x];
50 |     const float2 midpoint = 0.5f * (line.CP[0] + line.CP[2]);
51 |     return length(line.CP[1] - midpoint) / length(line.CP[2] - line.CP[0]);
52 | }
53 | 
54 | // Host-side setup: gives each of the N_LINES entries of bLines_h random control
55 | // points and a zero vertex count. Marked inline because the definition lives in
56 | // a header and would otherwise be multiply defined if included from several
57 | // translation units.
58 | inline void initializeBLines(BezierLine *bLines_h)
59 | {
60 |     float2 last = {0, 0};
61 |     for (int i = 0; i < N_LINES; i++)
62 |     {
63 |         // Set first point of this line to last point of previous line
64 |         bLines_h[i].CP[0] = last;
65 |         for (int j = 1; j < 3; j++)
66 |         {
67 |             // Assign random coordinate between 0 and 1
68 |             bLines_h[i].CP[j].x = (float)rand() / RAND_MAX;
69 |             bLines_h[i].CP[j].y = (float)rand() / RAND_MAX;
70 |         }
71 |         last = bLines_h[i].CP[2]; // keep the last point of this line
72 |         // Set number of tessellated vertices to zero
73 |         bLines_h[i].nVertices = 0;
74 |     }
75 | }


--------------------------------------------------------------------------------
/CUDA/bezierCurves/bezierCurves1.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        bezierCurves1.cu
  3 |  * Description: Implement Bezier Curve Calculation without dynamic parallelism
  4 |  *              
  5 |  * Compile:     nvcc -o bezierCurves1 bezierCurves1.cu -I.. -I. $(pkg-config opencv4 --libs --cflags)
  6 |  * Run:         ./bezierCurves1
  7 |  * Argument:    n.a
  8 |  *****************************************************************************/
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | #include <common/common.h>
 12 | 
 13 | #include <opencv2/highgui.hpp>
 14 | #include <opencv2/imgproc.hpp>
 15 | 
 16 | #include <cuda.h>
 17 | #include "bezierCurves.cuh"
 18 | 
 19 | __global__ void computeBezierLines(BezierLine* bLines, int nLines)
 20 | {
 21 |     int bIdx = blockIdx.x;  // each block handles the line at index blockIdx.x
 22 |     if (bIdx < nLines) {
 23 |         // Compute the curvature of the line
 24 |         float curvature = computeCurvature(bLines);
 25 |         // From the curvature, compute the number of tessellation points, clamped
 26 |         // to the vertexPos capacity so the writes below stay in bounds
 27 |         int nTessPoints = min(max((int)(curvature*16.0f), 4), MAX_TESS_POINTS);
 28 |         bLines[bIdx].nVertices = nTessPoints;
 29 | 
 30 |         // Loop through vertices to be tessellated, incrementing by blockDim.x
 31 |         for (int i = 0; i < nTessPoints; i += blockDim.x) {
 32 |             int idx = i + threadIdx.x;  // compute a unique index for this point
 33 |             if (idx < nTessPoints) {
 34 |                 float u = (float)idx / (nTessPoints - 1);   // Compute u from idx
 35 |                 float omu = 1.0f - u;                       // pre-compute one minus u
 36 |                 float B3u[3];                               // Compute quadratic Bezier coefficients
 37 |                 B3u[0] = omu * omu;
 38 |                 B3u[1] = 2.0f * u * omu;
 39 |                 B3u[2] = u * u;
 40 |                 float2 position = {0, 0};
 41 |                 for (int j = 0; j < 3; j++) {
 42 |                     // Add the contribution of the j'th control point to position
 43 |                     position = position + (B3u[j] * bLines[bIdx].CP[j]);
 44 |                 }
 45 |                 // Assign value of vertex position to the correct array element
 46 |                 bLines[bIdx].vertexPos[idx] = position;
 47 |             }
 48 |         }
 49 |     }
 50 | }
 51 | 
 52 | // Main function
 53 | int main(int argc, char **argv)
 54 | {
 55 |     // Tessellates N_LINES Bezier lines on the GPU, prints the elapsed time and
 56 |     // animates the first lines in an OpenCV window. argc/argv are unused.
 57 |     CUDA_CHECK(cudaSetDevice(0));
 58 | 
 59 |     // Create the lines on the host and copy them to the device
 60 |     BezierLine *bLines_h = new BezierLine[N_LINES];
 61 |     initializeBLines(bLines_h);
 62 | 
 63 |     BezierLine *bLines_d;
 64 |     CUDA_CHECK(cudaMalloc((void **)&bLines_d, N_LINES * sizeof(BezierLine)));
 65 |     CUDA_CHECK(cudaMemcpy(bLines_d, bLines_h, N_LINES * sizeof(BezierLine), cudaMemcpyHostToDevice));
 66 | 
 67 |     // One block per line. The timed span covers the launch and the blocking
 68 |     // copy back, which also waits for the kernel to finish.
 69 |     double start, finish;
 70 |     GET_TIME(start);
 71 |     computeBezierLines<<<N_LINES, BLOCK_DIM>>>(bLines_d, N_LINES);
 72 |     CUDA_CHECK(cudaGetLastError());     // a rejected launch reports no error by itself
 73 |     CUDA_CHECK(cudaMemcpy(bLines_h, bLines_d, N_LINES*sizeof(BezierLine), cudaMemcpyDeviceToHost));
 74 |     GET_TIME(finish);
 75 | 
 76 |     printf("Elapsed time: %.6f msec\n", (finish - start)*1000);
 77 | 
 78 |     // White canvas holding a rows x cols grid of img_width-pixel cells
 79 |     const int rows = 4;
 80 |     const int cols = 4;
 81 |     const int img_width = 196;
 82 |     cv::Mat dstImage(img_width * (rows + 1), img_width * (cols + 1), CV_8UC3, cv::Scalar(255, 255, 255));
 83 | 
 84 |     // Draw one line per cell, but never index past the N_LINES that exist
 85 |     const int numberOfdisplay = ((N_LINES) < rows * cols) ? (N_LINES) : rows * cols;
 86 |     int max_points = 0;
 87 |     for (int i = 0; i < numberOfdisplay; i++) {
 88 |         const int r = i / cols;
 89 |         const int c = i % cols;
 90 |         // Control polygon in black: CP[0]-CP[1], then CP[1]-CP[2]
 91 |         for (int j = 0; j < 2; j++) {
 92 |             cv::line(dstImage,
 93 |                     cv::Point((r*img_width) + ((img_width/4) + bLines_h[i].CP[j].x*img_width), (c*img_width) + ((img_width/4) + bLines_h[i].CP[j].y*img_width)),
 94 |                     cv::Point((r*img_width) + ((img_width/4) + bLines_h[i].CP[j+1].x*img_width), (c*img_width) + ((img_width/4) + bLines_h[i].CP[j+1].y*img_width)),
 95 |                     cv::Scalar(0,0,0), 2);
 96 |         }
 97 | 
 98 |         // Track the largest vertex count; it bounds the animation below
 99 |         if (bLines_h[i].nVertices > max_points)
100 |             max_points = bLines_h[i].nVertices;
101 |     }
102 | 
103 |     // Animate: step k adds segment k of every displayed curve that has one,
104 |     // then shows the canvas for 500 ms
105 |     for (int k = 0; k < max_points - 1; k++) {
106 |         for (int i = 0; i < numberOfdisplay; i++) {
107 |             const int r = i / cols;
108 |             const int c = i % cols;
109 | 
110 |             if (k < bLines_h[i].nVertices - 1) {
111 |                 cv::line(dstImage,
112 |                         cv::Point(r*img_width + ((img_width/4) + bLines_h[i].vertexPos[k].x*img_width), c*img_width + ((img_width/4) + bLines_h[i].vertexPos[k].y*img_width)),
113 |                         cv::Point(r*img_width + ((img_width/4) + bLines_h[i].vertexPos[k+1].x*img_width), c*img_width + ((img_width/4) + bLines_h[i].vertexPos[k+1].y*img_width)),
114 |                         cv::Scalar(255,0,0), 2);
115 |             }
116 |         }
117 |         cv::imshow("win", dstImage);
118 |         cv::waitKey(500);
119 |     }
120 |     cv::waitKey(0);     // keep the finished picture up until a key is pressed
121 | 
122 |     CUDA_CHECK(cudaFree(bLines_d));
123 |     delete[] bLines_h;
124 | 
125 |     return 0;
126 | }


--------------------------------------------------------------------------------
/CUDA/common/common_string.h:
--------------------------------------------------------------------------------
  1 | #ifndef __COMMON_STRING_H__
  2 | #define __COMMON_STRING_H__
  3 | 
  4 | #include <stdlib.h>
  5 | #include <string.h>
  6 | 
  7 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
  8 | #define strncasecmp _strnicmp
  9 | #define strcasecmp strcmpi
 10 | #endif
 11 | 
 12 | inline int stringRemoveDelimiter(char delimiter, const char* string)
 13 | {
 14 |     // Returns the index of the first character after the leading run of
 15 |     // `delimiter`, or 0 when that run reaches the end of the string.
 16 |     const char* cursor = string;
 17 |     for (; *cursor == delimiter; ++cursor) {
 18 |         // skip one leading delimiter
 19 |     }
 20 | 
 21 |     const int skipped = static_cast<int>(cursor - string);
 22 |     const int total = static_cast<int>(strlen(string));
 23 | 
 24 |     return (skipped >= total) ? 0 : skipped;
 25 | }
 26 | 
 27 | inline bool checkCmdLineFlag(int argc, const char** argv, const char* str_ref)
 28 | {
 29 |     // Reports whether some argument after argv[0] names the flag str_ref. An
 30 |     // argument's name is its text after the leading '-' run, up to the first
 31 |     // '=' if there is one; names are compared case-insensitively.
 32 |     bool matched = false;
 33 | 
 34 |     // No iterations happen when argv holds only the program name.
 35 |     for (int arg = 1; arg < argc; ++arg) {
 36 |         const char *name = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
 37 |         const char *eq = strchr(name, '=');
 38 | 
 39 |         const int name_len = static_cast<int>(eq ? eq - name : strlen(name));
 40 |         const int ref_len = static_cast<int>(strlen(str_ref));
 41 | 
 42 |         // Equal lengths plus an equal prefix means the names are identical.
 43 |         if (name_len == ref_len && strncasecmp(name, str_ref, ref_len) == 0) {
 44 |             matched = true;     // keep scanning; later arguments cannot unset it
 45 |         }
 46 |     }
 47 |     return matched;
 48 | }
 49 | 
 50 | inline bool getCmdLineArgumentString(int argc, const char** argv, const char* str_ref, char** str_retval)
 51 | {
 52 |     // Looks for arguments after argv[0] whose name (leading '-' run removed)
 53 |     // begins with str_ref, case-insensitively. On a match *str_retval points at
 54 |     // the text after the one-character separator (the last match wins) and the
 55 |     // result is true; with no match *str_retval is NULL and the result is false.
 56 |     bool found = false;
 57 |     for (int i = 1; i < argc; i++) {
 58 |         int str_start = stringRemoveDelimiter('-', argv[i]);
 59 |         char* str_argv = const_cast<char *>(&argv[i][str_start]);
 60 |         int length = static_cast<int>(strlen(str_ref));
 61 | 
 62 |         if (!strncasecmp(str_argv, str_ref, length)) {
 63 |             // A bare flag ends right after the name: skipping a separator there
 64 |             // would step past the terminator, so yield the empty string instead.
 65 |             *str_retval = &str_argv[length + (str_argv[length] != '\0' ? 1 : 0)];
 66 |             found = true;
 67 |         }
 68 |     }
 69 | 
 70 |     if (!found) *str_retval = NULL;
 71 |     return found;
 72 | }
 73 | 
 74 | inline int getCmdLineArgumentInt(int argc, const char** argv, const char* str_ref)
 75 | {
 76 |     // Returns the base-10 integer attached to the last argument whose name
 77 |     // (leading '-' run removed) starts with str_ref, case-insensitively.
 78 |     // Returns 0 when no argument matches or the match carries no value text.
 79 |     bool found = false;
 80 |     int value = -1;
 81 | 
 82 |     for (int i = 1; i < argc; i++) {
 83 |         int str_start = stringRemoveDelimiter('-', argv[i]);
 84 |         char* str_argv = const_cast<char *>(&argv[i][str_start]);
 85 |         int length = static_cast<int>(strlen(str_ref));
 86 | 
 87 |         if (!strncasecmp(str_argv, str_ref, length)) {
 88 |             if (length + 1 <= static_cast<int>(strlen(str_argv))) {
 89 |                 // Text follows the name; step over an '=' separator if present
 90 |                 int auto_inc = (str_argv[length] == '=') ? 1 : 0;
 91 |                 value = strtol(&str_argv[length + auto_inc], NULL, 10);
 92 |             }
 93 |             else {
 94 |                 value = 0;      // bare flag without a value
 95 |             }
 96 | 
 97 |             // Mark found only on a real match, so an absent flag returns 0
 98 |             // rather than the -1 placeholder.
 99 |             found = true;
100 |         }
101 |     }
102 | 
103 |     return found ? value : 0;
104 | }
105 | 
106 | #endif


--------------------------------------------------------------------------------
/CUDA/deviceQuery/simpleDeviceQuery.cu:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        simpleDeviceQuery.cu
 3 |  * Description: Query device information
 4 |  *              
 5 |  * Compile:     nvcc -o simpleDeviceQuery simpleDeviceQuery.cu
 6 |  * Run:         ./simpleDeviceQuery
 7 |  *****************************************************************************/
 8 | #include <stdio.h>
 9 | #include <cuda_runtime.h>
10 | 
11 | int main(int argc, char** argv)
12 | {
13 |     // Prints a short summary of the hardware limits of device 0.
14 |     // argc/argv are unused.
15 |     int dev = 0;
16 |     cudaDeviceProp devProp;
17 |     cudaError_t err = cudaGetDeviceProperties(&devProp, dev);
18 |     if (err != cudaSuccess) {
19 |         // devProp may be left uninitialized on failure, so report and stop
20 |         fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
21 |         return 1;
22 |     }
23 | 
24 |     printf("Device %d: %s\n", dev, devProp.name);
25 |     printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
26 |     // Byte counts are divided by 1024.0 to print them in KB
27 |     printf("Total amount of constant memory: %4.2f KB\n", devProp.totalConstMem/1024.0);
28 |     printf("Total amount of shared memory per block: %4.2f KB\n", devProp.sharedMemPerBlock/1024.0);
29 |     printf("Total number of registers available per block: %d\n", devProp.regsPerBlock);
30 |     printf("Warp size: %d\n", devProp.warpSize);
31 |     printf("Maximum number of threads per multiprocessor: %d\n", devProp.maxThreadsPerMultiProcessor);
32 |     // Warps per multiprocessor is derived: threads per multiprocessor / warp size
33 |     printf("Maximum number of warps per multiprocessor: %d\n", devProp.maxThreadsPerMultiProcessor/devProp.warpSize);
34 | 
35 |     return 0;
36 | }


--------------------------------------------------------------------------------
/CUDA/imageProcessing/convertColorToGrey.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        convertColorToGrey.cu
  3 |  * Description: Convert color scale to grey scale of input image.
  4 |  *              This program does not save the result image; it only displays it.
  5 |  *              For reading image, OpenCV library should be used.
  6 |  *              
  7 |  * Compile:     nvcc -o convertColorToGrey convertColorToGrey.cu -I.. -lcuda $(pkg-config opencv4 --libs --cflags)
  8 |  * Run:         ./convertColorToGrey <image file path>
  9 |  *****************************************************************************/
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <common/common.h>
 13 | 
 14 | #include <cuda_runtime.h>
 15 | #include <opencv2/highgui.hpp>
 16 | #include <opencv2/imgproc.hpp>
 17 | 
 18 | #define CHANNELS 3
 19 | 
 20 | void Usage(char prog_name[]);
 21 | __global__
 22 | void colorToGreyscaleConversion(unsigned char* in, unsigned char* out, int width, int height);
 23 | 
 24 | int main(int argc, char** argv)
 25 | {
 26 |     if (argc != 2) {
 27 |         Usage(argv[0]);
 28 |     }
 29 | 
 30 |     // Load the image; cv::imread yields an empty Mat when the file cannot be read
 31 |     const char* file_name = argv[1];
 32 |     cv::Mat origImg = cv::imread(file_name);
 33 |     if (origImg.empty()) {
 34 |         fprintf(stderr, "Failed to read image: %s\n", file_name);
 35 |         exit(EXIT_FAILURE);
 36 |     }
 37 |     int width = origImg.cols;
 38 |     int height = origImg.rows;
 39 |     int channels = origImg.channels();
 40 |     printf("Image size = (%d x %d x %d)\n", width, height, channels);
 41 |     assert(channels == CHANNELS);
 42 | 
 43 |     // Preview the input at half size
 44 |     cv::Mat half;
 45 |     cv::resize(origImg, half, cv::Size(width/2, height/2));
 46 |     cv::imshow("image", half);
 47 |     cv::waitKey(0);
 48 | 
 49 |     // cv::imread stores pixels as BGR but the kernel reads them as RGB, so
 50 |     // reorder the channels before staging the pixels for upload
 51 |     cv::Mat rgbImg;
 52 |     cv::cvtColor(origImg, rgbImg, cv::COLOR_BGR2RGB);
 53 |     unsigned char *h_origImg = (unsigned char*)malloc(width * height * channels * sizeof(unsigned char));
 54 |     unsigned char *h_resultImg = (unsigned char*)malloc(width * height * sizeof(unsigned char));
 55 |     (void)memcpy(h_origImg, rgbImg.data, width * height * channels);
 56 | 
 57 |     unsigned char *d_origImg, *d_resultImg;
 58 |     CUDA_CHECK(cudaMalloc((void**)&d_origImg, width * height * channels * sizeof(unsigned char)));
 59 |     CUDA_CHECK(cudaMalloc((void**)&d_resultImg, width * height * sizeof(unsigned char)));
 60 |     CUDA_CHECK(cudaMemcpy(d_origImg, h_origImg, width * height * channels * sizeof(unsigned char), cudaMemcpyHostToDevice));
 61 | 
 62 |     // Launch one thread per pixel in 16x16 blocks, rounding the grid up
 63 |     const int block_size = 16;
 64 |     dim3 threads(block_size, block_size);
 65 |     dim3 grid(ceil(width / (double)threads.x), ceil(height / (double)threads.y));
 66 |     colorToGreyscaleConversion<<<grid, threads>>>(d_origImg, d_resultImg, width, height);
 67 |     CUDA_CHECK(cudaGetLastError());     // surface launch failures immediately
 68 | 
 69 |     // The blocking copy back also waits for the kernel to finish
 70 |     CUDA_CHECK(cudaMemcpy(h_resultImg, d_resultImg, width * height * sizeof(unsigned char), cudaMemcpyDeviceToHost));
 71 |     cv::Mat resultImg(height, width, CV_8UC1);
 72 |     memcpy(resultImg.data, h_resultImg, width * height);
 73 | 
 74 |     // Release device and host buffers
 75 |     CUDA_CHECK(cudaFree(d_origImg));
 76 |     CUDA_CHECK(cudaFree(d_resultImg));
 77 |     free(h_origImg);
 78 |     free(h_resultImg);
 79 | 
 80 |     // Show the greyscale result at half size
 81 |     cv::resize(resultImg, resultImg, cv::Size(width/2, height/2));
 82 |     cv::imshow("image", resultImg);
 83 |     cv::waitKey(0);
 84 |     return 0;
 85 | }
 86 | 
 87 | // Print the expected command line to stderr, then exit with EXIT_FAILURE.
 88 | void Usage(char prog_name[]) {
 89 |     fprintf(stderr, "Usage: %s <image file path>\n", prog_name);
 90 |     exit(EXIT_FAILURE);
 91 | }
 92 | 
 93 | // Input image has 3 channels corresponding to RGB
 94 | // The input image is encoded as unsigned characters [0, 255]
 95 | __global__
 96 | void colorToGreyscaleConversion(unsigned char* in, unsigned char* out, int width, int height)
 97 | {
 98 |     // Each thread converts at most one pixel; its 2D global index is the pixel position.
 99 |     const int x = blockIdx.x * blockDim.x + threadIdx.x;
100 |     const int y = blockIdx.y * blockDim.y + threadIdx.y;
101 | 
102 |     // Threads that fall outside the image do nothing
103 |     if (y >= height || x >= width) {
104 |         return;
105 |     }
106 | 
107 |     // `in` holds CHANNELS interleaved bytes per pixel, `out` a single byte
108 |     const int pixel = y*width + x;
109 |     const unsigned char* rgb = in + pixel*CHANNELS;
110 | 
111 |     // Weighted sum of the first (red), second (green) and third (blue) bytes
112 |     out[pixel] = 0.21f * rgb[0] + 0.71f * rgb[1] + 0.07f * rgb[2];
113 | }


--------------------------------------------------------------------------------
/CUDA/imageProcessing/imageBlur.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        imageBlur.cu
  3 |  * Description: Blur input image using 3D blocks.
  4 |  *              This program does not save the result image; it only displays it.
  5 |  *              For reading image, OpenCV library should be used.
  6 |  *              
  7 |  * Compile:     nvcc -o imageBlur imageBlur.cu -I.. -lcuda $(pkg-config opencv4 --libs --cflags)
  8 |  * Run:         ./imageBlur <image file path>
  9 |  *****************************************************************************/
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <common/common.h>
 13 | 
 14 | #include <cuda_runtime.h>
 15 | #include <opencv2/highgui.hpp>
 16 | #include <opencv2/imgproc.hpp>
 17 | 
 18 | #define CHANNELS 3
 19 | #define BLUR_SIZE 10
 20 | 
 21 | void Usage(char prog_name[]);
 22 | __global__
 23 | void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int channel);
 24 | 
 25 | int main(int argc, char** argv)
 26 | {
 27 |     if (argc != 2) {
 28 |         Usage(argv[0]);
 29 |     }
 30 | 
 31 |     // Load the image; cv::imread yields an empty Mat when the file cannot be read
 32 |     const char* file_name = argv[1];
 33 |     cv::Mat origImg = cv::imread(file_name);
 34 |     if (origImg.empty()) {
 35 |         fprintf(stderr, "Failed to read image: %s\n", file_name);
 36 |         exit(EXIT_FAILURE);
 37 |     }
 38 |     int width = origImg.cols;
 39 |     int height = origImg.rows;
 40 |     int channels = origImg.channels();
 41 |     printf("Image size = (%d x %d x %d)\n", width, height, channels);
 42 |     assert(channels == CHANNELS);
 43 | 
 44 |     // Preview the input at half size
 45 |     cv::Mat half;
 46 |     cv::resize(origImg, half, cv::Size(width/2, height/2));
 47 |     cv::imshow("image", half);
 48 |     cv::waitKey(0);
 49 | 
 50 |     // Stage the interleaved pixel bytes in plain host buffers
 51 |     unsigned char *h_origImg = (unsigned char*)malloc(width * height * channels * sizeof(unsigned char));
 52 |     unsigned char *h_resultImg = (unsigned char*)malloc(width * height * channels * sizeof(unsigned char));
 53 |     (void)memcpy(h_origImg, origImg.data, width * height * channels);
 54 | 
 55 |     unsigned char *d_origImg, *d_resultImg;
 56 |     CUDA_CHECK(cudaMalloc((void**)&d_origImg, width * height * channels * sizeof(unsigned char)));
 57 |     CUDA_CHECK(cudaMalloc((void**)&d_resultImg, width * height * channels * sizeof(unsigned char)));
 58 |     // Copy the host input in host memory to the device input in device memory
 59 |     CUDA_CHECK(cudaMemcpy(d_origImg, h_origImg, width * height * channels * sizeof(unsigned char), cudaMemcpyHostToDevice));
 60 | 
 61 |     // Launch the blur kernel: x/y threads cover pixels, z threads cover channels
 62 |     const int block_size = 16;
 63 |     dim3 threads(block_size, block_size, channels);
 64 |     dim3 grid(ceil(width / (double)threads.x), ceil(height / (double)threads.y));
 65 |     blurKernel<<<grid, threads>>>(d_origImg, d_resultImg, width, height, channels);
 66 |     CUDA_CHECK(cudaGetLastError());     // surface launch failures immediately
 67 | 
 68 |     // The blocking copy back also waits for the kernel to finish
 69 |     CUDA_CHECK(cudaMemcpy(h_resultImg, d_resultImg, width * height * channels * sizeof(unsigned char), cudaMemcpyDeviceToHost));
 70 |     cv::Mat resultImg(height, width, CV_8UC3);
 71 |     memcpy(resultImg.data, h_resultImg, width * height * channels);
 72 | 
 73 |     // Free device global memory
 74 |     CUDA_CHECK(cudaFree(d_origImg));
 75 |     CUDA_CHECK(cudaFree(d_resultImg));
 76 | 
 77 |     // Free host memory
 78 |     free(h_origImg);
 79 |     free(h_resultImg);
 80 | 
 81 |     // Show the blurred result at half size
 82 |     cv::resize(resultImg, resultImg, cv::Size(width/2, height/2));
 83 |     cv::imshow("image", resultImg);
 84 |     cv::waitKey(0);
 85 |     return 0;
 86 | }
 87 | 
 88 | // Print the expected command line to stderr, then exit with EXIT_FAILURE.
 89 | void Usage(char prog_name[]) {
 90 |     fprintf(stderr, "Usage: %s <image file path>\n", prog_name);
 91 |     exit(EXIT_FAILURE);
 92 | }
 93 | 
 94 | __global__
 95 | void blurKernel(unsigned char* in, unsigned char* out, int width, int height, int channel)
 96 | {
 97 |     // Box blur. Each thread averages one channel of one pixel over the
 98 |     // (2*BLUR_SIZE+1)-wide square window centred on that pixel, using only the
 99 |     // part of the window that lies inside the image.
100 |     // Thread x/y pick the pixel; thread z picks the channel byte within it.
101 |     int Plane = blockIdx.z * blockDim.z + threadIdx.z;
102 |     int Row = blockIdx.y * blockDim.y + threadIdx.y;
103 |     int Col = blockIdx.x * blockDim.x + threadIdx.x;
104 | 
105 |     if (Row < height && Col < width && Plane < channel) {
106 |         int pixelVal = 0;
107 |         int pixelCnt = 0;
108 | 
109 |         // Offsets run from -BLUR_SIZE to +BLUR_SIZE inclusive so the window is
110 |         // symmetric; stopping one short would weight the average toward the
111 |         // rows above and the columns to the left.
112 |         for (int bRow = -BLUR_SIZE; bRow <= BLUR_SIZE; bRow++) {
113 |             for (int bCol = -BLUR_SIZE; bCol <= BLUR_SIZE; bCol++) {
114 |                 int curRow = Row + bRow;
115 |                 int curCol = Col + bCol;
116 | 
117 |                 // Only neighbours inside the image contribute
118 |                 if (curRow >= 0 && curRow < height && curCol >= 0 && curCol < width) {
119 |                     pixelVal += in[(curRow * width + curCol) * channel + Plane];
120 |                     pixelCnt++;
121 |                 }
122 |             }
123 |         }
124 | 
125 |         // pixelCnt is at least 1: the centre pixel always passes the bounds test
126 |         out[(Row * width + Col) * channel + Plane] = (unsigned char)(pixelVal / pixelCnt);
127 |     }
128 | }


--------------------------------------------------------------------------------
/CUDA/imageProcessing/lena.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/junstar92/parallel_programming_study/9886cc1e6f630c7b89b402ad2ffa60653ce8edfa/CUDA/imageProcessing/lena.jpg


--------------------------------------------------------------------------------
/CUDA/matrixAdd/matrixAdd.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        matrixAdd.cu
  3 |  * Description: Matrix addition, C = A + B
  4 |  *              A,B and C have m x n dimensions.
  5 |  *              
  6 |  * Compile:     nvcc -o matrixAdd matrixAdd.cu -I.. -lcuda
  7 |  * Run:         ./matrixAdd <m> <n>
  8 |  *                  <m> : the number of rows in Matrix A, B
  9 |  *                  <n> : the number of columns in Matrix A, B.
 10 |  *****************************************************************************/
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <cuda_runtime.h>
 14 | #include <common/common.h>
 15 | 
 16 | void Usage(char prog_name[]);
 17 | __global__ void matrixAdd(const float *A, const float *B, float *C, const int M, const int N);
 18 | 
 19 | int main(int argc, char* argv[])
 20 | {
 21 |     // Adds two random m x n matrices on the GPU, verifies the result on the
 22 |     // host and prints the measured performance. m and n come from argv.
 23 |     if (argc != 3) Usage(argv[0]);
 24 | 
 25 |     int m = strtol(argv[1], NULL, 10);
 26 |     int n = strtol(argv[2], NULL, 10);
 27 |     printf("[Matrix addition, C = A + B]\n");
 28 |     printf("\tA, B, and C are (%d x %d) matrix\n", m, n);
 29 | 
 30 |     // Allocate the host matrix A, B, C
 31 |     float *h_A = (float*)malloc(m * n * sizeof(float));
 32 |     float *h_B = (float*)malloc(m * n * sizeof(float));
 33 |     float *h_C = (float*)malloc(m * n * sizeof(float));
 34 | 
 35 |     // Verify that allocations succeeded
 36 |     if (h_A == NULL || h_B == NULL || h_C == NULL) {
 37 |         fprintf(stderr, "Failed to allocate host matrix!\n");
 38 |         exit(EXIT_FAILURE);     // an allocation failure must not report success
 39 |     }
 40 |     
 41 |     // Initialize that host matrix
 42 |     common_random_init_matrix<float>(h_A, m, n);
 43 |     common_random_init_matrix<float>(h_B, m, n);
 44 | 
 45 |     // Allocate the device matrix A, B, C
 46 |     float *d_A, *d_B, *d_C;
 47 |     CUDA_CHECK(cudaMalloc((void**)&d_A, m * n * sizeof(float)));
 48 |     CUDA_CHECK(cudaMalloc((void**)&d_B, m * n * sizeof(float)));
 49 |     CUDA_CHECK(cudaMalloc((void**)&d_C, m * n * sizeof(float)));
 50 | 
 51 |     // Copy the host input matrix A and B in host memory 
 52 |     // to the device input matrix in device memory
 53 |     printf("Copy input data from the host memory to the CUDA device\n");
 54 |     CUDA_CHECK(cudaMemcpy(d_A, h_A, m * n * sizeof(float), cudaMemcpyHostToDevice));
 55 |     CUDA_CHECK(cudaMemcpy(d_B, h_B, m * n * sizeof(float), cudaMemcpyHostToDevice));
 56 | 
 57 |     // Allocate CUDA events for estimating
 58 |     cudaEvent_t start, stop;
 59 |     CUDA_CHECK(cudaEventCreate(&start));
 60 |     CUDA_CHECK(cudaEventCreate(&stop));
 61 | 
 62 |     // Launch the Matrix Add CUDA Kernel; grid x spans the m rows, grid y the n columns
 63 |     const int block_size = 16;
 64 |     dim3 threads(block_size, block_size);
 65 |     dim3 grid(ceil(m / (float)threads.x), ceil(n / (float)threads.y));
 66 |     printf("CUDA kernel launch with (%d x %d) blocks of (%d x %d) threads\n", grid.x, grid.y, threads.x, threads.y);
 67 | 
 68 |     CUDA_CHECK(cudaDeviceSynchronize());
 69 |     CUDA_CHECK(cudaEventRecord(start));
 70 | 
 71 |     matrixAdd<<<grid, threads>>>(d_A, d_B, d_C, m, n);
 72 |     CUDA_CHECK(cudaGetLastError());     // a rejected launch reports no error by itself
 73 | 
 74 |     CUDA_CHECK(cudaDeviceSynchronize());
 75 |     CUDA_CHECK(cudaEventRecord(stop));
 76 | 
 77 |     // Copy the device result matrix in device memory
 78 |     // to the host result matrix in host memory
 79 |     printf("Copy output data from the CUDA device to the host memory\n");
 80 |     CUDA_CHECK(cudaMemcpy(h_C, d_C, m * n * sizeof(float), cudaMemcpyDeviceToHost));
 81 | 
 82 |     // Verify that the result matrix is correct
 83 |     common_verify_matAdd(h_A, h_B, h_C, m, n);
 84 |     
 85 |     // Compute and Print the performance
 86 |     COMPUTE_MATADD_PERFORMANCE(start, stop, m, n, threads.x * threads.y);
 87 |     
 88 |     // Free device global memory
 89 |     CUDA_CHECK(cudaFree(d_A));
 90 |     CUDA_CHECK(cudaFree(d_B));
 91 |     CUDA_CHECK(cudaFree(d_C));
 92 |     CUDA_CHECK(cudaEventDestroy(start));
 93 |     CUDA_CHECK(cudaEventDestroy(stop));
 94 | 
 95 |     // Free host memory
 96 |     free(h_A);
 97 |     free(h_B);
 98 |     free(h_C);
 99 | 
100 |     printf("Done\n");
101 | 
102 |     return 0;
103 | }
104 | 
105 | // Print the expected command line to stderr, then exit with EXIT_FAILURE.
106 | void Usage(char prog_name[]) {
107 |     fprintf(stderr, "Usage: %s <m> <n>\n", prog_name);
108 |     fputs("\t<m> : the number of rows in matrix A, B.\n", stderr);
109 |     fputs("\t<n> : the number of columns in matrix A, B.\n", stderr);
110 |     exit(EXIT_FAILURE);
111 | }
112 | 
113 | __global__
114 | void matrixAdd(const float *A, const float *B, float *C, const int M, const int N)
115 | {
116 |     // One thread per element of the M x N row-major matrices: the x index
117 |     // selects the row and the y index the column.
118 |     // NOTE(review): threads adjacent in x are therefore N elements apart in
119 |     // memory; swapping the roles would need a matching change to the launch grid.
120 |     const int r = blockIdx.x * blockDim.x + threadIdx.x;
121 |     const int c = blockIdx.y * blockDim.y + threadIdx.y;
122 |     if (r >= M || c >= N) {
123 |         return;     // outside the matrix
124 |     }
125 | 
126 |     const int at = (r * N) + c;
127 |     C[at] = A[at] + B[at];
128 | }


--------------------------------------------------------------------------------
/CUDA/matrixAdd/matrixAdd2.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        matrixAdd2.cu
  3 |  * Description: Matrix addition, C = A + B
  4 |  *              A,B and C have 2^14 x 2^14 dimensions.
  5 |  *              
  6 |  * Compile:     nvcc -O3 -o matrixAdd2 matrixAdd2.cu -I..
  7 |  * Run:         ./matrixAdd2 
  8 |  *****************************************************************************/
  9 | #include <stdio.h>
 10 | #include <common/common.h>
 11 | #include <cuda_runtime.h>
 12 | 
 13 | void initialData(float* p, const int size)
 14 | {
 15 |     // Fill p[0..size) with the low byte of rand() divided by 10 (0.0 to 25.5)
 16 |     for (int remaining = size; remaining > 0; --remaining)
 17 |         *p++ = (float)(rand() & 0xFF) / 10.0f;
 18 | }
 19 | 
 20 | void sumMatrixOnHost(float* A, float* B, float* C, const int nx, const int ny)
 21 | {
 22 |     // Element-wise C = A + B over ny rows of nx contiguous floats each
 23 |     for (int row = 0; row < ny; ++row) {
 24 |         const int base = row * nx;      // offset of this row's first element
 25 |         for (int col = 0; col < nx; ++col) C[base + col] = A[base + col] + B[base + col];
 26 |     }
 27 | }
 28 | 
 29 | void checkResult(float* hostRef, float* gpuRef, const int N)
 30 | {
 31 |     // Report the first element pair differing by more than the tolerance;
 32 |     // prints nothing when every pair agrees.
 33 |     const double tolerance = 1.0e-8;
 34 |     int i = 0;
 35 |     while (i < N && !(abs(hostRef[i] - gpuRef[i]) > tolerance)) ++i;
 36 | 
 37 |     if (i >= N) return;     // no mismatch found
 38 |     printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
 39 |     printf("Arrays do not match.\n\n");
 40 | }
 41 | 
 42 | // grid 2D block 2D
 43 | __global__
 44 | void sumMatrixOnGPU2D(float* A, float* B, float* C, int NX, int NY)
 45 | {
 46 |     // One thread per element: x indexes within a row of NX floats, y picks the row
 47 |     const unsigned int col = threadIdx.x + blockIdx.x*blockDim.x;
 48 |     const unsigned int row = threadIdx.y + blockIdx.y*blockDim.y;
 49 | 
 50 |     // Threads past either edge of the NX x NY matrix do nothing
 51 |     if (col >= NX || row >= NY) return;
 52 |     C[row*NX + col] = A[row*NX + col] + B[row*NX + col];
 53 | }
 54 | 
 55 | int main(int argc, char** argv)
 56 | {
 57 |     // Times sumMatrixOnGPU2D on a 2^14 x 2^14 float matrix. Two optional
 58 |     // arguments override the 32 x 32 thread-block dimensions.
 59 |     // setup device
 60 |     int dev = 0;
 61 |     cudaDeviceProp devProp;
 62 |     CUDA_CHECK(cudaGetDeviceProperties(&devProp, dev));
 63 |     CUDA_CHECK(cudaSetDevice(dev));
 64 | 
 65 |     // setup data size of matrix; the byte count is kept in size_t because
 66 |     // 2^28 floats is 2^30 bytes, already close to the limit of int
 67 |     int nx = 1 << 14;
 68 |     int ny = 1 << 14;
 69 |     int nxy = nx * ny;
 70 |     size_t nBytes = (size_t)nxy * sizeof(float);
 71 | 
 72 |     // malloc host memory
 73 |     float *h_A, *h_B, *hostRef, *gpuRef;
 74 |     h_A = (float*)malloc(nBytes);
 75 |     h_B = (float*)malloc(nBytes);
 76 |     hostRef = (float*)malloc(nBytes);
 77 |     gpuRef = (float*)malloc(nBytes);
 78 | 
 79 |     // four buffers of this size can exceed available memory; stop before
 80 |     // a NULL pointer is written through
 81 |     if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
 82 |         fprintf(stderr, "Failed to allocate host matrix!\n");
 83 |         return 1;
 84 |     }
 85 | 
 86 |     // initialize data at host
 87 |     initialData(h_A, nxy);
 88 |     initialData(h_B, nxy);
 89 |     memset(hostRef, 0, nBytes);
 90 |     memset(gpuRef, 0, nBytes);
 91 | 
 92 |     double start, finish;
 93 |     // add matrix at host for result (the host reference run is disabled)
 94 |     GET_TIME(start);
 95 |     //sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
 96 |     GET_TIME(finish);
 97 |     //printf("sumMatrixOnHost elapsed %f ms\n", (finish-start)*1000.f);
 98 | 
 99 |     // malloc device global memory
100 |     float *d_A, *d_B, *d_C;
101 |     CUDA_CHECK(cudaMalloc((void**)&d_A, nBytes));
102 |     CUDA_CHECK(cudaMalloc((void**)&d_B, nBytes));
103 |     CUDA_CHECK(cudaMalloc((void**)&d_C, nBytes));
104 | 
105 |     // transfer data from host to device
106 |     CUDA_CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
107 |     CUDA_CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
108 | 
109 |     // invoke kernel at host
110 |     int dimx = 32;
111 |     int dimy = 32;
112 | 
113 |     if (argc > 2) {
114 |         dimx = atoi(argv[1]);
115 |         dimy = atoi(argv[2]);
116 |     }
117 | 
118 |     // atoi yields 0 for non-numeric text; a zero dimension would divide by
119 |     // zero in the grid computation below
120 |     if (dimx <= 0 || dimy <= 0) {
121 |         fprintf(stderr, "Block dimensions must be positive integers\n");
122 |         return 1;
123 |     }
124 | 
125 |     // round the grid up so partial blocks cover the matrix edges
126 |     dim3 block(dimx, dimy);
127 |     dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
128 | 
129 |     // execute the kernel; synchronizing before and after confines the
130 |     // host-side timing to the kernel
131 |     CUDA_CHECK(cudaDeviceSynchronize());
132 |     GET_TIME(start);
133 |     sumMatrixOnGPU2D<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
134 |     CUDA_CHECK(cudaDeviceSynchronize());
135 |     GET_TIME(finish);
136 |     printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f ms\n", grid.x, grid.y, block.x, block.y, (finish-start)*1000.f);
137 |     CUDA_CHECK(cudaGetLastError());
138 | 
139 |     // copy kernel result back to host
140 |     CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
141 | 
142 |     // check device result (disabled together with the host reference run)
143 |     //checkResult(hostRef, gpuRef, nxy);
144 | 
145 |     // free device global memory
146 |     CUDA_CHECK(cudaFree(d_A));
147 |     CUDA_CHECK(cudaFree(d_B));
148 |     CUDA_CHECK(cudaFree(d_C));
149 | 
150 |     // free host memory
151 |     free(h_A);
152 |     free(h_B);
153 |     free(hostRef);
154 |     free(gpuRef);
155 | 
156 |     // reset device
157 |     CUDA_CHECK(cudaDeviceReset());
158 | 
159 |     return 0;
160 | }


--------------------------------------------------------------------------------
/CUDA/matrixMul/matrixMul.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        matrixMul.cu
  3 |  * Description: Matrix multiplication, C = AB
  4 |  *              A has m x k dimensions, B has k x n dimensions, and C has
  5 |  *              m x n dimensions.
  6 |  *              This implementation is not optimized for performance.
  7 |  *              
  8 |  * Compile:     nvcc -o matrixMul matrixMul.cu -I.. -lcuda
  9 |  * Run:         ./matrixMul <m> <k> <n>
 10 |  *                  <m> : the number of rows in Matrix A
 11 |  *                  <k> : the number of columns in Matrix A, it is also
 12 |  *                        the number of rows in Matrix B.
 13 |  *                  <n> : the number of columns in Matrix B.
 14 |  *****************************************************************************/
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <cuda_runtime.h>
 18 | #include <common/common.h>
 19 | 
 20 | void Usage(char prog_name[]);
 21 | __global__ void matrixMul(const float *A, const float *B, float *C, const int M, const int K, const int N);
 22 | 
 23 | int main(int argc, char* argv[])
 24 | {
 25 |     if (argc != 4) {
 26 |         Usage(argv[0]);
 27 |     }
 28 | 
 29 |     int m = strtol(argv[1], NULL, 10);
 30 |     int k = strtol(argv[2], NULL, 10);
 31 |     int n = strtol(argv[3], NULL, 10);
 32 |     printf("[Matrix multiplication, C = AB]\n");
 33 |     printf("\tA is (%d x %d) matrix, B is (%d x %d) matrix, and \n", m, k, k, n);
 34 |     printf("\tC is (%d x %d) matrix.\n", m, n);
 35 | 
 36 |     // Allocate the host matrix A, B, C
 37 |     float *h_A = (float*)malloc(m * k * sizeof(float));
 38 |     float *h_B = (float*)malloc(k * n * sizeof(float));
 39 |     float *h_C = (float*)malloc(m * n * sizeof(float));
 40 | 
 41 |     // Verify that allocations succeeded
 42 |     if (h_A == NULL || h_B == NULL || h_C == NULL) {
 43 |         fprintf(stderr, "Failed to allocate host matrix!\n");
 44 |         exit(EXIT_SUCCESS);
 45 |     }
 46 |     
 47 |     // Initialize that host matrix
 48 |     common_random_init_matrix<float>(h_A, m, k);
 49 |     common_random_init_matrix<float>(h_B, k, n);
 50 | 
 51 |     // Allocate the device matrix A, B, C
 52 |     float *d_A, *d_B, *d_C;
 53 |     CUDA_CHECK(cudaMalloc((void**)&d_A, m * k * sizeof(float)));
 54 |     CUDA_CHECK(cudaMalloc((void**)&d_B, k * n * sizeof(float)));
 55 |     CUDA_CHECK(cudaMalloc((void**)&d_C, m * n * sizeof(float)));
 56 | 
 57 |     // Copy the host input matrix A and B in host memory 
 58 |     // to the device input matrix in device memory
 59 |     printf("Copy input data from the host memory to the CUDA device\n");
 60 |     CUDA_CHECK(cudaMemcpy(d_A, h_A, m * k * sizeof(float), cudaMemcpyHostToDevice));
 61 |     CUDA_CHECK(cudaMemcpy(d_B, h_B, k * n * sizeof(float), cudaMemcpyHostToDevice));
 62 | 
 63 |     // Allocate CUDA events for estimating
 64 |     cudaEvent_t start, stop;
 65 |     CUDA_CHECK(cudaEventCreate(&start));
 66 |     CUDA_CHECK(cudaEventCreate(&stop));
 67 | 
 68 |     // Launch the Matrix Multiplication CUDA Kernel
 69 |     const int block_size = 16;
 70 |     dim3 threads(block_size, block_size);
 71 |     dim3 grid(ceil(m / (float)threads.x), ceil(n / (float)threads.y));
 72 |     printf("CUDA kernel launch with (%d x %d) blocks of (%d x %d) threads\n", grid.x, grid.y, threads.x, threads.y);
 73 | 
 74 |     CUDA_CHECK(cudaDeviceSynchronize());
 75 |     CUDA_CHECK(cudaEventRecord(start));
 76 | 
 77 |     matrixMul<<<grid, threads>>>(d_A, d_B, d_C, m, k, n);
 78 | 
 79 |     CUDA_CHECK(cudaDeviceSynchronize());
 80 |     CUDA_CHECK(cudaEventRecord(stop));
 81 | 
 82 | 
 83 |     // Copy the device result matrix in device memory
 84 |     // to the host result matrix in host memory
 85 |     printf("Copy output data from the CUDA device to the host memory\n");
 86 |     CUDA_CHECK(cudaMemcpy(h_C, d_C, m * n * sizeof(float), cudaMemcpyDeviceToHost));
 87 | 
 88 |     // Verify that the result matrix is correct (L2-norm error)
 89 |     common_verify_matMul_l2ne(h_A, h_B, h_C, m, k, n);
 90 |     
 91 |     // Compute and Print the performance
 92 |     COMPUTE_MATMUL_PERFORMANCE(start, stop, m, k, n, threads.x * threads.y);
 93 |     
 94 |     // Free device global memory
 95 |     CUDA_CHECK(cudaFree(d_A));
 96 |     CUDA_CHECK(cudaFree(d_B));
 97 |     CUDA_CHECK(cudaFree(d_C));
 98 |     CUDA_CHECK(cudaEventDestroy(start));
 99 |     CUDA_CHECK(cudaEventDestroy(stop));
100 | 
101 |     // Free host memory
102 |     free(h_A);
103 |     free(h_B);
104 |     free(h_C);
105 | 
106 |     printf("Done\n");
107 | 
108 |     return 0;
109 | }
110 | 
/*
 * Print the command-line synopsis to stderr and terminate the program with
 * EXIT_FAILURE. Does not return.
 */
void Usage(char prog_name[])
{
    fprintf(stderr, "Usage: %s <m> <k> <n>\n", prog_name);
    // The remaining lines contain no conversions, so fputs emits them as-is.
    fputs("\t<m> : the number of rows in matrix A.\n", stderr);
    fputs("\t<k> : the number of columns in Matrix A, it is also\n", stderr);
    fputs("\t      the number of rows in Matrix B.\n", stderr);
    fputs("\t<n> : the number of columns in matrix B.\n", stderr);
    exit(EXIT_FAILURE);
}
120 | 
/*
 * Kernel: C = A * B with one thread per element of C.
 *   A is indexed as an (M x K) row-major array, B as (K x N), C as (M x N).
 *   The x dimension of the launch selects the column of C and the y dimension
 *   selects the row; threads that fall outside (M x N) do nothing.
 */
__global__
void matrixMul(const float *A, const float *B, float *C, const int M, const int K, const int N)
{
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    const int col = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail blocks of the grid.
    if (row >= M || col >= N) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    float acc = 0.0f;
    for (int kk = 0; kk < K; ++kk) {
        acc += A[(row * K) + kk] * B[(N * kk) + col];
    }
    C[(row * N) + col] = acc;
}


--------------------------------------------------------------------------------
/CUDA/reduction/reduction.h:
--------------------------------------------------------------------------------
1 | #ifndef __REDUCTION_H__
2 | #define __REDUCTION_H__
3 | 
4 | template <class T>
5 | void reduce(int size, int threads, int blocks, int smemSize,
6 |             int whichKernel, T *d_in, T *d_out);
7 | 
8 | #endif


--------------------------------------------------------------------------------
/CUDA/simpleDivergence/simpleDivergence.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        simpleDivergence.cu
  3 |  * Description: Measure the performance of some kernels.
  4 |  *              One has warp divergence and the others do not have warp divergence.
  5 |  *              
  6 |  * Compile:     nvcc -g -G -arch=sm_75 -o simpleDivergence simpleDivergence.cu -I..
  7 |  * Run:         ./simpleDivergence [block size] [data size]
  8 |  * Argument:    both optional; each defaults to 64
  9 |  *****************************************************************************/
 10 | #include <stdio.h>
 11 | #include <cuda_runtime.h>
 12 | #include <common/common.h>
 13 | 
 14 | __global__ void mathKernel1(float* c)
 15 | {
 16 | 	int tid = blockDim.x * blockIdx.x + threadIdx.x;
 17 |     float a, b;
 18 |     a = b = 0.0f;
 19 |     
 20 |     if (tid % 2 == 0) {
 21 |     	a = 100.0f;
 22 |     }
 23 |     else {
 24 |     	b = 200.0f;
 25 |     }
 26 |     c[tid] = a + b;
 27 | }
 28 | 
 29 | __global__ void mathKernel2(float* c)
 30 | {
 31 | 	int tid = blockDim.x * blockIdx.x + threadIdx.x;
 32 |     float a, b;
 33 |     a = b = 0.0f;
 34 |     
 35 |     if ((tid / warpSize) % 2 == 0) {
 36 |     	a = 100.0f;
 37 |     }
 38 |     else {
 39 |     	b = 200.0f;
 40 |     }
 41 |     c[tid] = a + b;
 42 | }
 43 | 
 44 | __global__ void mathKernel3(float* c)
 45 | {
 46 | 	int tid = blockDim.x * blockIdx.x + threadIdx.x;
 47 |     float a, b;
 48 |     a = b = 0.0f;
 49 |     
 50 |     bool pred = (tid % 2 == 0);
 51 | 
 52 |     if (pred) {
 53 |     	a = 100.0f;
 54 |     }
 55 | 
 56 |     if (!pred) {
 57 |     	b = 200.0f;
 58 |     }
 59 | 
 60 |     c[tid] = a + b;
 61 | }
 62 | 
 63 | __global__ void mathKernel4(float* c)
 64 | {
 65 | 	int tid = blockDim.x * blockIdx.x + threadIdx.x;
 66 |     float a, b;
 67 |     a = b = 0.0f;
 68 |     
 69 |     int itid = tid >> 5;
 70 | 
 71 |     if (itid & 0x01 == 0) {
 72 |     	a = 100.0f;
 73 |     }
 74 |     else {
 75 |     	b = 200.0f;
 76 |     }
 77 |     
 78 |     c[tid] = a + b;
 79 | }
 80 | 
 81 | __global__ void warmingup(float *c)
 82 | {
 83 |     int tid = blockDim.x * blockIdx.x + threadIdx.x;
 84 |     float a, b;
 85 |     a = b = 0.0f;
 86 |     
 87 |     if ((tid / warpSize) % 2 == 0) {
 88 |     	a = 100.0f;
 89 |     }
 90 |     else {
 91 |     	b = 200.0f;
 92 |     }
 93 |     c[tid] = a + b;
 94 | }
 95 | 
 96 | int main(int argc, char** argv)
 97 | {
 98 |     // set up device
 99 |     int dev = 0;
100 |     cudaDeviceProp deviceProp;
101 |     cudaGetDeviceProperties(&deviceProp, dev);
102 |     printf("%s using Device %d: %s\n", argv[0], dev, deviceProp.name);
103 | 
104 |     // set up data size
105 |     int size = 64;
106 |     int blockSize = 64;
107 |     if (argc > 1)
108 |         blockSize = atoi(argv[1]);
109 |     if (argc > 2)
110 |         size = atoi(argv[2]);
111 |     printf("Data size: %d ", size);
112 | 
113 |     // set up execution configuration
114 |     dim3 block(blockSize, 1);
115 |     dim3 grid((size+block.x-1) / block.x, 1);
116 |     printf("Excution Configure (block %d grid %d)\n", block.x, grid.x);
117 | 
118 |     // allocate gpu memory
119 |     float *d_C;
120 |     size_t nBytes = size * sizeof(float);
121 |     cudaMalloc((void**)&d_C, nBytes);
122 | 
123 |     double start, finish;
124 |     // run a warmup kernel to remove overhead
125 |     cudaDeviceSynchronize();
126 |     GET_TIME(start);
127 |     warmingup<<<grid, block>>>(d_C);
128 |     cudaDeviceSynchronize();
129 |     GET_TIME(finish);
130 |     printf("warmup      <<< %4d %4d >>> elapsed %f sec\n", grid.x, block.x, finish-start);
131 | 
132 |     // run kernel 1
133 |     GET_TIME(start);
134 |     mathKernel1<<<grid, block>>>(d_C);
135 |     cudaDeviceSynchronize();
136 |     GET_TIME(finish);
137 |     printf("mathKernel1 <<< %4d %4d >>> elapsed %f sec\n", grid.x, block.x, finish-start);
138 | 
139 |     // run kernel 2
140 |     GET_TIME(start);
141 |     mathKernel2<<<grid, block>>>(d_C);
142 |     cudaDeviceSynchronize();
143 |     GET_TIME(finish);
144 |     printf("mathKernel2 <<< %4d %4d >>> elapsed %f sec\n", grid.x, block.x, finish-start);
145 | 
146 |     // run kernel 3
147 |     GET_TIME(start);
148 |     mathKernel3<<<grid, block>>>(d_C);
149 |     cudaDeviceSynchronize();
150 |     GET_TIME(finish);
151 |     printf("mathKernel3 <<< %4d %4d >>> elapsed %f sec\n", grid.x, block.x, finish-start);
152 | 
153 |     // run kernel 4
154 |     GET_TIME(start);
155 |     mathKernel4<<<grid, block>>>(d_C);
156 |     cudaDeviceSynchronize();
157 |     GET_TIME(finish);
158 |     printf("mathKernel4 <<< %4d %4d >>> elapsed %f sec\n", grid.x, block.x, finish-start);
159 | 
160 | 
161 |     // free gpu memory and reset device
162 |     cudaFree(d_C);
163 |     cudaDeviceReset();
164 | 
165 |     return 0;
166 | }


--------------------------------------------------------------------------------
/CUDA/vectorAdd/vectorAdd.cu:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        vectorAdd.cu
  3 |  * Description: Vector addition, C = A + B
  4 |  *              This code is a basic sample that implements element by element
  5 |  *              vector addition.
  6 |  *              
  7 |  * Compile:     nvcc -o vectorAdd vectorAdd.cu -I.. -lcuda
  8 |  * Run:         ./vectorAdd <n>
  9 |  *                  <n> : the number of elements in vector
 10 |  *****************************************************************************/
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <cuda_runtime.h>
 14 | #include <common/common.h>
 15 | 
 16 | void Usage(char prog_name[]);
 17 | void vecAdd(const float *h_A, const float *h_B, float *h_C, int numElements);
 18 | __global__ void vecAddKernel(const float *A, const float *B, float *C, int numElements);
 19 | 
 20 | int main(int argc, char* argv[])
 21 | {
 22 |     if (argc != 2) {
 23 |         Usage(argv[0]);
 24 |     }
 25 | 
 26 |     int numElements = strtol(argv[1], NULL, 10);
 27 |     printf("[Vector addition of %d elements]\n", numElements);
 28 | 
 29 |     // Allocate the host input vectors A, B, C
 30 |     float *h_A = (float*)malloc(numElements * sizeof(float));
 31 |     float *h_B = (float*)malloc(numElements * sizeof(float));
 32 |     float *h_C = (float*)malloc(numElements * sizeof(float));
 33 | 
 34 |     // Verify that allocations succeeded
 35 |     if (h_A == NULL || h_B == NULL || h_C == NULL) {
 36 |         fprintf(stderr, "Failed to allocate host vectors!\n");
 37 |         exit(EXIT_SUCCESS);
 38 |     }
 39 | 
 40 |     // Initialize that host input vectors
 41 |     common_random_init_vector(h_A, numElements);
 42 |     common_random_init_vector(h_B, numElements);
 43 | 
 44 |     // call vecAdd function
 45 |     vecAdd(h_A, h_B, h_C, numElements);
 46 | 
 47 |     // Free host memory
 48 |     free(h_A);
 49 |     free(h_B);
 50 |     free(h_C);
 51 | 
 52 |     printf("Done\n");
 53 |     return 0;
 54 | }
 55 | 
 56 | void Usage(char prog_name[])
 57 | {
 58 |     fprintf(stderr, "Usage: %s <n>\n", prog_name);
 59 |     fprintf(stderr, "\t<n> : the number of elements in vector\n");
 60 |     exit(EXIT_FAILURE);
 61 | }
 62 | 
 63 | void vecAdd(const float *h_A, const float *h_B, float *h_C, int numElements)
 64 | {
 65 |     // Allocate the device input vectors A, B, C
 66 |     float *d_A, *d_B, *d_C;
 67 |     CUDA_CHECK(cudaMalloc((void**)&d_A, numElements * sizeof(float)));
 68 |     CUDA_CHECK(cudaMalloc((void**)&d_B, numElements * sizeof(float)));
 69 |     CUDA_CHECK(cudaMalloc((void**)&d_C, numElements * sizeof(float)));
 70 | 
 71 |     // Copy the host input vector A and B in host memory 
 72 |     // to the device input vectors in device memory
 73 |     printf("Copy input data from the host memory to the CUDA device\n");
 74 |     CUDA_CHECK(cudaMemcpy(d_A, h_A, numElements * sizeof(float), cudaMemcpyHostToDevice));
 75 |     CUDA_CHECK(cudaMemcpy(d_B, h_B, numElements * sizeof(float), cudaMemcpyHostToDevice));
 76 | 
 77 |     // Allocate CUDA events for estimating
 78 |     cudaEvent_t start, stop;
 79 |     CUDA_CHECK(cudaEventCreate(&start));
 80 |     CUDA_CHECK(cudaEventCreate(&stop));
 81 | 
 82 |     // Launch the Vector Add CUDA Kernel
 83 |     int threadsPerBlock = 256;
 84 |     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
 85 |     printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 86 | 
 87 |     CUDA_CHECK(cudaDeviceSynchronize());
 88 |     CUDA_CHECK(cudaEventRecord(start));
 89 |     vecAddKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
 90 |     CUDA_CHECK(cudaDeviceSynchronize());
 91 |     CUDA_CHECK(cudaEventRecord(stop));
 92 | 
 93 |     CUDA_CHECK(cudaGetLastError());
 94 | 
 95 |     // Copy the device result vector in device memory
 96 |     // to the host result vector in host memory
 97 |     printf("Copy output data from the CUDA device to the host memory\n");
 98 |     CUDA_CHECK(cudaMemcpy(h_C, d_C, numElements * sizeof(float), cudaMemcpyDeviceToHost));
 99 |     
100 |     // Verify that the result vector is correct (sampling)
101 |     printf("Verifying vector addition...\n");
102 |     for (int idx = 0; idx < numElements; idx++) {
103 |         //printf("[INDEX %d] %f + %f = %f\n", idx, h_A[idx], h_B[idx], h_C[idx]);
104 |         if (fabs(h_A[idx] + h_B[idx] - h_C[idx]) > 1e-5) {
105 |             fprintf(stderr, "Result verification failed at element %d\n", idx);
106 |             exit(EXIT_FAILURE);
107 |         }
108 |     }
109 |     printf(".....\n");
110 |     printf("Test PASSED\n");
111 | 
112 |     // Compute and Print the performance
113 |     float msecTotal = 0.0f;
114 |     CUDA_CHECK(cudaEventElapsedTime(&msecTotal, start, stop));
115 |     double flopsPerVecAdd = static_cast<double>(numElements);
116 |     double gigaFlops = (flopsPerVecAdd * 1.0e-9f) / (msecTotal / 1000.0f);
117 |     printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size = %.0f Ops, "
118 |            "WorkgroupSize= %u threads/block\n",
119 |            gigaFlops, msecTotal, flopsPerVecAdd, threadsPerBlock);
120 |     
121 |     // Free device global memory
122 |     CUDA_CHECK(cudaFree(d_A));
123 |     CUDA_CHECK(cudaFree(d_B));
124 |     CUDA_CHECK(cudaFree(d_C));
125 |     CUDA_CHECK(cudaEventDestroy(start));
126 |     CUDA_CHECK(cudaEventDestroy(stop));
127 | }
128 | 
/*
 * Kernel: C[i] = A[i] + B[i], one element per thread on a 1D launch.
 * Threads whose global index is numElements or greater do nothing.
 */
__global__
void vecAddKernel(const float *A, const float *B, float *C, int numElements)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail block of the grid.
    if (idx >= numElements) {
        return;
    }
    C[idx] = A[idx] + B[idx];
}


--------------------------------------------------------------------------------
/OpenMP/00_omp_hello.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        00_omp_hello.c
 3 |  * Purpose:     A parallel hello, world program that uses OpenMP
 4 |  * Compile:     gcc -Wall -fopenmp -o 00_omp_hello 00_omp_hello.c
 5 |  * Run:         ./00_omp_hello <number of threads>
 6 |  * 
 7 |  * Input:       none
 8 |  * Output:      A message from each thread
 9 |  *****************************************************************************/
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <omp.h>
13 | 
14 | /* thread function */
15 | void Hello(void);
16 | 
/*
 * Entry point: starts a parallel region in which every thread calls Hello().
 *
 * argv[1], when present and positive, is the number of threads to request.
 * Without it (or with a non-positive value) the OpenMP default returned by
 * omp_get_max_threads() is used, which keeps `./00_omp_hello` with no
 * arguments working instead of passing a NULL argv[1] to strtol.
 */
int main(int argc, char* argv[])
{
    int thread_count = omp_get_max_threads();

    if (argc > 1) {
        int requested = (int)strtol(argv[1], NULL, 10);
        if (requested > 0)
            thread_count = requested;
    }

#pragma omp parallel num_threads(thread_count)
    Hello();

    return 0;
}
26 | 
/*****************************************************************************
 * Function:        Hello
 * Purpose:         Thread function; prints the calling thread's number and
 *                  the size of the current team to stdout.
 *****************************************************************************/
void Hello(void)
{
    const int team_size = omp_get_num_threads();
    const int rank = omp_get_thread_num();

    printf("Hello from thread %d of %d\n", rank, team_size);
}


--------------------------------------------------------------------------------
/OpenMP/01_omp_hello_errchk.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        01_omp_hello_errchk.c
 3 |  * Purpose:     A parallel hello, world program that uses OpenMP
 4 |  * Compile:     gcc -Wall -fopenmp -o 01_omp_hello_errchk 01_omp_hello_errchk.c
 5 |  * Run:         ./01_omp_hello_errchk <number of threads>
 6 |  * 
 7 |  * Input:       none
 8 |  * Output:      A message from each thread
 9 |  *****************************************************************************/
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #ifdef _OPENMP
13 | #include <omp.h>
14 | #endif
15 | 
16 | void Usage(char* prog_name);
17 | void Hello(int thread_count);   /* thread function */
18 | 
/*
 * Entry point: validates the single <thread_count> argument (Usage() exits
 * when it is missing or not positive) and runs Hello() on that many threads.
 * When compiled with OpenMP the value of _OPENMP is printed first.
 */
int main(int argc, char* argv[])
{
#ifdef _OPENMP
    printf("_OPENMP = %d\n", _OPENMP);
#endif

    /* Usage() does not return, so argv[1] is only read when it exists. */
    if (argc != 2)
        Usage(argv[0]);

    const int thread_count = strtol(argv[1], NULL, 10);
    if (!(thread_count > 0))
        Usage(argv[0]);

#pragma omp parallel num_threads(thread_count)
    {
        Hello(thread_count);
    }

    return 0;
}
37 | 
/*****************************************************************************
 * Function:        Usage
 * Purpose:         Print a message indicating how program should be started
 *                  and terminate.
 * Input arg:       prog_name: name shown in the usage line
 * Note:            Does not return. Exits with EXIT_FAILURE, because this is
 *                  only reached on invalid command-line input.
 *****************************************************************************/
void Usage(char* prog_name)
{
    fprintf(stderr, "Usage: %s <thread_count>\n", prog_name);
    fprintf(stderr, "   thread_count should be positive\n");
    exit(EXIT_FAILURE);
}
49 | 
/*****************************************************************************
 * Function:        Hello
 * Purpose:         Thread function that prints a greeting with the calling
 *                  thread's number and the actual team size. Thread 0 also
 *                  reports on stderr when the team size differs from the
 *                  requested thread_count.
 * Input arg:       thread_count: number of threads that was requested
 * Note:            Without OpenMP the function behaves as thread 0 of 1.
 *****************************************************************************/
void Hello(int thread_count)
{
#ifdef _OPENMP
    const int actual_thread_count = omp_get_num_threads();
    const int my_rank = omp_get_thread_num();
#else
    const int actual_thread_count = 1;
    const int my_rank = 0;
#endif

    if (my_rank == 0) {
        if (actual_thread_count != thread_count)
            fprintf(stderr, "Number of threads started != %d\n", thread_count);
    }
    printf("Hello from thread %d of %d\n", my_rank, actual_thread_count);
}


--------------------------------------------------------------------------------
/OpenMP/02_omp_trap1.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        02_omp_trap1.c
 3 |  * Purpose:     Estimate definite integral (or area under curve) using 
 4 |  *              trapezoidal rule.
 5 |  * Compile:     gcc -Wall -fopenmp -o 02_omp_trap1 02_omp_trap1.c
 6 |  * Run:         ./02_omp_trap1 <number of threads>
 7 |  * 
 8 |  * Input:       a, b, n
 9 |  * Output:      estimate of integral from a to b of f(x) using n trapezoidals.
10 |  *****************************************************************************/
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | #include <omp.h>
14 | 
15 | void Usage(char* prog_name);
16 | double f(double x); /* function we're integrating */
17 | void Trap(double a, double b, int n, double* global_result_p);
18 | 
/*
 * Entry point: reads the thread count from argv[1] and a, b, n from stdin,
 * then lets every thread add its share of the trapezoidal estimate to
 * global_result through Trap().
 *
 * Usage() terminates the program when the argument count is wrong, when
 * thread_count is not positive, or when n is not divisible by thread_count.
 * Unreadable input or n <= 0 is reported on stderr and returns EXIT_FAILURE.
 */
int main(int argc, char* argv[])
{
    if (argc != 2)
        Usage(argv[0]);

    int thread_count = strtol(argv[1], NULL, 10);
    /* A zero thread_count would divide by zero in the modulo below. */
    if (thread_count <= 0)
        Usage(argv[0]);

    double a, b;
    int n;
    printf("Enter a, b, and n\n");
    /* Without this check a, b and n would be used uninitialized on bad input. */
    if (scanf("%lf %lf %d", &a, &b, &n) != 3 || n <= 0) {
        fprintf(stderr, "Expected two numbers a, b and a positive integer n\n");
        return EXIT_FAILURE;
    }
    if (n % thread_count != 0)
        Usage(argv[0]);

    double global_result = 0.0;
#pragma omp parallel num_threads(thread_count)
    Trap(a, b, n, &global_result);

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %f\n", a, b, global_result);

    return 0;
}
42 | 
/*****************************************************************************
 * Function:        Usage
 * Purpose:         Print a message indicating how program should be started
 *                  and terminate.
 * Input arg:       prog_name: name shown in the usage line
 * Note:            Does not return. Exits with EXIT_FAILURE, because this is
 *                  only reached on invalid input.
 *****************************************************************************/
void Usage(char* prog_name)
{
    fprintf(stderr, "Usage: %s <thread_count>\n", prog_name);
    fprintf(stderr, "   number of trapezoids must be evenly divisible by number of threads\n");
    exit(EXIT_FAILURE);
}
54 | 
/*****************************************************************************
 * Function:        f
 * Purpose:         Integrand used by the trapezoidal rule: f(x) = x^2
 * Input arg:       x
 * Return val:      x multiplied by itself
 *****************************************************************************/
double f(double x)
{
    const double x_squared = x * x;
    return x_squared;
}
65 | 
/*****************************************************************************
 * Function:        Trap
 * Purpose:         Apply the trapezoidal rule to the calling thread's
 *                  sub-interval of [a, b] and add that partial estimate to
 *                  the shared total.
 * Input arg:
 *      a: left endpoint
 *      b: right endpoint
 *      n: total number of trapezoids; each thread handles
 *         n / omp_get_num_threads() of them
 * In/out arg:
 *      p_global_result: shared accumulator; the update is done inside an
 *                       omp critical section
 *****************************************************************************/
void Trap(double a, double b, int n, double* p_global_result)
{
    const int rank = omp_get_thread_num();
    const int team_size = omp_get_num_threads();

    const double step = (b-a)/n;
    const int chunk = n/team_size;
    /* Endpoints of the sub-interval owned by this thread. */
    const double left = a + rank*chunk*step;
    const double right = left + chunk*step;

    /* Endpoints count half, interior points count fully. */
    double partial = (f(left) + f(right))/2.0;
    int i = 1;
    while (i < chunk) {
        partial += f(left + i*step);
        i++;
    }
    partial = partial*step;

#pragma omp critical
    *p_global_result += partial;
}


--------------------------------------------------------------------------------
/OpenMP/03_omp_trap2.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        03_omp_trap2.c
  3 |  * Purpose:     Estimate definite integral (or area under curve) using 
  4 |  *              trapezoidal rule. This version uses a hand-coded reduction
  5 |  *              after the function call.
  6 |  * Compile:     gcc -Wall -fopenmp -o 03_omp_trap2 03_omp_trap2.c
  7 |  * Run:         ./03_omp_trap2 <number of threads>
  8 |  * 
  9 |  * Input:       a, b, n
 10 |  * Output:      estimate of integral from a to b of f(x) using n trapezoidals.
 11 |  *****************************************************************************/
 12 | #include <stdio.h>
 13 | #include <stdlib.h>
 14 | #include <omp.h>
 15 | 
 16 | void Usage(char* prog_name);
 17 | double f(double x); /* function we're integrating */
 18 | double Local_trap(double a, double b, int n);
 19 | 
 20 | int main(int argc, char* argv[])
 21 | {
 22 |     if (argc != 2)
 23 |         Usage(argv[0]);
 24 | 
 25 |     int thread_count = strtol(argv[1], NULL, 10);
 26 |     
 27 |     double a, b;
 28 |     int n;
 29 |     printf("Enter a, b, and n\n");
 30 |     scanf("%lf %lf %d", &a, &b, &n);
 31 |     if (n % thread_count != 0)
 32 |         Usage(argv[0]);
 33 |     double start, finish;
 34 |     start = omp_get_wtime();
 35 |     double global_result = 0.0;
 36 | #pragma omp parallel num_threads(thread_count)
 37 |     {
 38 |         double my_result = 0.0;
 39 |         my_result += Local_trap(a, b, n);
 40 | #pragma omp critical
 41 |         global_result += my_result;
 42 |     }
 43 |     finish = omp_get_wtime();
 44 | 
 45 |     printf("With n = %d trapezoids, our estimate\n", n);
 46 |     printf("of the integral from %f to %f = %f\n", a, b, global_result);
 47 |     printf("Elapsed time = %f seconds\n", finish-start);
 48 | 
 49 |     return 0;
 50 | }
 51 | 
 52 | /*****************************************************************************
 53 |  * Function:        Usage
 54 |  * Purpose:         Print a message indicating how program should be started
 55 |  *                  and terminate.
 56 |  *****************************************************************************/
 57 | void Usage(char* prog_name)
 58 | {
 59 |     fprintf(stderr, "Usage: %s <thread_count>\n", prog_name);
 60 |     fprintf(stderr, "   number of trapezoids must be evenly divisible by number of threads\n");
 61 |     exit(0);
 62 | }
 63 | 
 64 | /*****************************************************************************
 65 |  * Function:        f
 66 |  * Purpose:         Compute value of function to be integrated
 67 |  * Input arg:       x
 68 |  * Return val:      f(x)
 69 |  *****************************************************************************/
 70 | double f(double x)
 71 | {
 72 |     return x*x;
 73 | }
 74 | 
 75 | /*****************************************************************************
 76 |  * Function:        Trap
 77 |  * Purpose:         Use trapezoidal rule to estimate part of a definite
 78 |  *                  integral
 79 |  * Input arg:       
 80 |  *      a: left endpoint
 81 |  *      b: right endpoint
 82 |  *      n: number of trapezoids
 83 |  * Return: estimate of integral from local_a to local_b
 84 |  *****************************************************************************/
 85 | double Local_trap(double a, double b, int n)
 86 | {
 87 |     double h, local_a, local_b;
 88 |     int local_n;
 89 |     int my_rank = omp_get_thread_num();
 90 |     int thread_count = omp_get_num_threads();
 91 | 
 92 |     h = (b-a)/n;
 93 |     local_n = n/thread_count;
 94 |     local_a = a + my_rank*local_n*h;
 95 |     local_b = local_a + local_n*h;
 96 | 
 97 |     double my_result = (f(local_a) + f(local_b))/2.0;
 98 |     for (int i = 1; i < local_n; i++)
 99 |         my_result += f(local_a + i*h);
100 |     my_result = my_result*h;
101 | 
102 |     return my_result;
103 | }


--------------------------------------------------------------------------------
/OpenMP/04_omp_trap3.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        04_omp_trap3.c
 3 |  * Purpose:     Estimate definite integral (or area under curve) using 
 4 |  *              trapezoidal rule. This version uses a reduction clause.
 5 |  * Compile:     gcc -Wall -fopenmp -o 04_omp_trap3 04_omp_trap3.c
 6 |  * Run:         ./04_omp_trap3 <number of threads>
 7 |  * 
 8 |  * Input:       a, b, n
 9 |  * Output:      estimate of integral from a to b of f(x) using n trapezoidals.
10 |  *****************************************************************************/
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | #include <omp.h>
14 | 
15 | void Usage(char* prog_name);
16 | double f(double x); /* function we're integrating */
17 | double Local_trap(double a, double b, int n);
18 | 
/*
 * Entry point: reads the thread count from argv[1] and a, b, n from stdin,
 * then sums the per-thread results of Local_trap() with an OpenMP
 * reduction(+) clause.
 *
 * Usage() terminates the program when the argument count is wrong, when
 * thread_count is not positive, or when n is not divisible by thread_count.
 * Unreadable input or n <= 0 is reported on stderr and returns EXIT_FAILURE.
 */
int main(int argc, char* argv[])
{
    if (argc != 2)
        Usage(argv[0]);

    int thread_count = strtol(argv[1], NULL, 10);
    /* A zero thread_count would divide by zero in the modulo below. */
    if (thread_count <= 0)
        Usage(argv[0]);

    double a, b;
    int n;
    printf("Enter a, b, and n\n");
    /* Without this check a, b and n would be used uninitialized on bad input. */
    if (scanf("%lf %lf %d", &a, &b, &n) != 3 || n <= 0) {
        fprintf(stderr, "Expected two numbers a, b and a positive integer n\n");
        return EXIT_FAILURE;
    }
    if (n % thread_count != 0)
        Usage(argv[0]);

    double global_result = 0.0;
#pragma omp parallel num_threads(thread_count) \
        reduction(+: global_result)
    global_result += Local_trap(a, b, n);

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %f\n", a, b, global_result);

    return 0;
}
43 | 
/*****************************************************************************
 * Function:        Usage
 * Purpose:         Print a message indicating how program should be started
 *                  and terminate.
 * Input arg:       prog_name: name shown in the usage line
 * Note:            Does not return. Exits with EXIT_FAILURE, because this is
 *                  only reached on invalid input.
 *****************************************************************************/
void Usage(char* prog_name)
{
    fprintf(stderr, "Usage: %s <thread_count>\n", prog_name);
    fprintf(stderr, "   number of trapezoids must be evenly divisible by number of threads\n");
    exit(EXIT_FAILURE);
}
55 | 
/*****************************************************************************
 * Function:        f
 * Purpose:         Integrand used by the trapezoidal rule: f(x) = x^2
 * Input arg:       x
 * Return val:      x multiplied by itself
 *****************************************************************************/
double f(double x)
{
    const double x_squared = x * x;
    return x_squared;
}
66 | 
/*****************************************************************************
 * Function:        Local_trap
 * Purpose:         Apply the trapezoidal rule to the calling thread's
 *                  sub-interval of [a, b]
 * Input arg:
 *      a: left endpoint
 *      b: right endpoint
 *      n: total number of trapezoids; each thread handles
 *         n / omp_get_num_threads() of them
 * Return: estimate of the integral over this thread's sub-interval
 *****************************************************************************/
double Local_trap(double a, double b, int n)
{
    const int rank = omp_get_thread_num();
    const int team_size = omp_get_num_threads();

    const double step = (b-a)/n;
    const int chunk = n/team_size;
    /* Endpoints of the sub-interval owned by this thread. */
    const double left = a + rank*chunk*step;
    const double right = left + chunk*step;

    /* Endpoints count half, interior points count fully. */
    double partial = (f(left) + f(right))/2.0;
    int i = 1;
    while (i < chunk) {
        partial += f(left + i*step);
        i++;
    }

    return partial*step;
}


--------------------------------------------------------------------------------
/OpenMP/05_omp_trap4.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        05_omp_trap4.c
 3 |  * Purpose:     Estimate definite integral (or area under curve) using 
 4 |  *              trapezoidal rule. This version uses a parallel for directive.
 5 |  * Compile:     gcc -Wall -fopenmp -o 05_omp_trap4 05_omp_trap4.c
 6 |  * Run:         ./05_omp_trap4 <number of threads>
 7 |  * 
 8 |  * Input:       a, b, n
 9 |  * Output:      estimate of integral from a to b of f(x) using n trapezoidals.
10 |  * 
11 |  * Note:        In this version, it's not necessary for n to be evenly divisible
12 |  *              by thread_count
13 |  *****************************************************************************/
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <omp.h>
17 | 
18 | void Usage(char* prog_name);
19 | double f(double x); /* function we're integrating */
20 | double Trap(double a, double b, int n, int thread_count);
21 | 
/*
 * Entry point: reads the thread count from argv[1] and a, b, n from stdin,
 * then prints the trapezoidal estimate returned by Trap().
 *
 * Usage() terminates the program when the argument count is wrong or
 * thread_count is not positive. Unreadable input or n <= 0 is reported on
 * stderr and returns EXIT_FAILURE.
 */
int main(int argc, char* argv[])
{
    if (argc != 2)
        Usage(argv[0]);

    int thread_count = strtol(argv[1], NULL, 10);
    /* strtol yields 0 for non-numeric input; a thread count must be positive. */
    if (thread_count <= 0)
        Usage(argv[0]);

    double a, b;
    int n;
    printf("Enter a, b, and n\n");
    /* Without this check a, b and n would be used uninitialized on bad input. */
    if (scanf("%lf %lf %d", &a, &b, &n) != 3 || n <= 0) {
        fprintf(stderr, "Expected two numbers a, b and a positive integer n\n");
        return EXIT_FAILURE;
    }

    double global_result = 0.0;
    global_result = Trap(a, b, n, thread_count);

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %f\n", a, b, global_result);

    return 0;
}
42 | 
43 | /*****************************************************************************
44 |  * Function:        Usage
45 |  * Purpose:         Print a message indicating how program should be started
46 |  *                  and terminate.
47 |  *****************************************************************************/
/*
 * Print the command-line synopsis to stderr and terminate the program.
 *   prog_name: name the program was invoked with (argv[0])
 * Does not return.
 */
void Usage(char* prog_name)
{
    fprintf(stderr, "Usage: %s <thread_count>\n", prog_name);
    /* The parallel-for version places no divisibility requirement on n. */
    fprintf(stderr, "   number of trapezoids need not be evenly divisible by number of threads\n");
    exit(0);
}
54 | 
55 | /*****************************************************************************
56 |  * Function:        f
57 |  * Purpose:         Compute value of function to be integrated
58 |  * Input arg:       x
59 |  * Return val:      f(x)
60 |  *****************************************************************************/
/* Integrand: returns x squared. */
double f(double x)
{
    const double squared = x*x;
    return squared;
}
65 | 
66 | /*****************************************************************************
67 |  * Function:        Trap
68 |  * Purpose:         Use trapezoidal rule to estimate part of a definite
69 |  *                  integral
70 |  * Input arg:       
71 |  *      a: left endpoint
72 |  *      b: right endpoint
73 |  *      n: number of trapezoids
74 |  * Return: 
75 |  *      approx: estimate of integral from a to b of f(x)
76 |  *****************************************************************************/
/*
 * Trapezoidal-rule estimate of the integral of f over [a, b].
 *   a, b:         interval endpoints
 *   n:            number of trapezoids
 *   thread_count: team size requested for the parallel loop
 * The interior-point sum is accumulated with an OpenMP sum reduction.
 */
double Trap(double a, double b, int n, int thread_count)
{
    const double width = (b-a)/n;

    /* Endpoints carry half weight; interior points are added in the loop. */
    double approx = (f(a) + f(b))/2.0;

#pragma omp parallel for num_threads(thread_count) reduction(+: approx)
    for (int idx = 1; idx < n; idx++) {
        approx += f(a + idx*width);
    }

    return approx*width;
}


--------------------------------------------------------------------------------
/OpenMP/06_omp_fibo.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        06_omp_fibo.c
 3 |  * Purpose:     Try to compute the first n Fibonacci numbers using OpenMP.
 4 |  *              Show what happens if we try to parallelize a loop with
 5 |  *              dependences among the iterations. The program has a serious bug.
 6 |  * Compile:     gcc -Wall -fopenmp -o 06_omp_fibo 06_omp_fibo.c
 7 |  * Run:         ./06_omp_fibo <number of threads> <number of Fibonacci numbers>
 8 |  * 
 9 |  * Input:       none
10 |  * Output:      A list of Fibonacci numbers
11 |  * 
12 |  * Note:        If your output seems to be OK, try increasing the number of 
13 |  *              threads and/or n.
14 |  *****************************************************************************/
15 | #include <stdio.h>
16 | #include <stdlib.h>
17 | #include <omp.h>
18 | 
19 | void Usage(char* prog_name);
20 | 
/*
 * Entry point: parses <thread_count> and <n>, fills an array with the
 * "first n Fibonacci numbers" using a parallel for, and prints it.
 *
 * NOTE: the parallel loop has a loop-carried dependence (fibo[i] reads
 * fibo[i-1] and fibo[i-2]); per the file header this is the deliberate
 * bug the program demonstrates, so it is left in place.
 */
int main(int argc, char* argv[])
{
    int thread_count, n;
    long long* fibo;

    if (argc != 3)
        Usage(argv[0]);

    thread_count = strtol(argv[1], NULL, 10);
    n = strtol(argv[2], NULL, 10);
    /* A non-positive n would make the allocation size wrap or be zero. */
    if (thread_count < 1 || n < 1)
        Usage(argv[0]);

    fibo = (long long*)malloc((size_t)n * sizeof(long long));
    if (fibo == NULL) {
        fprintf(stderr, "Failed to allocate %d Fibonacci entries\n", n);
        return 1;
    }

    /* Seed only the entries that exist: fibo[1] is out of bounds when n == 1. */
    fibo[0] = 1;
    if (n > 1)
        fibo[1] = 1;

#pragma omp parallel for num_threads(thread_count)
    for (int i = 2; i < n; i++)
        fibo[i] = fibo[i-1] + fibo[i-2];

    printf("The first n Fibonacci numbers:\n");
    for (int i = 0; i < n; i++)
        printf("%d\t%lld\n", i, fibo[i]);

    free(fibo);

    return 0;
}
47 | 
48 | /*****************************************************************************
49 |  * Function:        Usage
50 |  * Purpose:         Print a message indicating how program should be started
51 |  *                  and terminate.
52 |  *****************************************************************************/
/*
 * Print the command-line synopsis to stderr and terminate the program.
 *   prog_name: name the program was invoked with (argv[0])
 * Does not return.
 */
void Usage(char* prog_name)
{
    /* The second placeholder was missing its closing '>'. */
    fprintf(stderr, "Usage: %s <thread_count> <number of Fibonacci numbers>\n", prog_name);
    exit(0);
}


--------------------------------------------------------------------------------
/OpenMP/07_omp_pi.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        07_omp_pi.c
 3 |  * Purpose:     Estimate pi using OpenMP and the formula
 4 |  *                  pi = 4*[1 - 1/3 + 1/5 - 1/7 + 1/9 - . . . ]
 5 |  * Compile:     gcc -Wall -fopenmp -o 07_omp_pi 07_omp_pi.c [-lm]
 6 |  * Run:         ./07_omp_pi <number of threads> <n>
 7 |  *              <n> is the number of terms of the series to use
 8 |  * 
 9 |  * Input:       none
10 |  * Output:      The estimate of pi and the value of pi computed by the arctan
11 |  *              function in the math library
12 |  *****************************************************************************/
13 | #include <stdio.h>
14 | #include <stdlib.h>
15 | #include <math.h>
16 | #include <omp.h>
17 | 
18 | void Usage(char* prog_name);
19 | 
/*
 * Entry point: parses <thread_count> and <n>, sums the first n terms of
 *     pi = 4*[1 - 1/3 + 1/5 - 1/7 + ...]
 * with an OpenMP sum reduction, and prints the estimate next to
 * 4*atan(1).
 */
int main(int argc, char* argv[])
{
    int thread_count;
    long long n;

    if (argc != 3)
        Usage(argv[0]);
    thread_count = strtol(argv[1], NULL, 10);
    n = strtoll(argv[2], NULL, 10);
    if (thread_count < 1 || n < 1)
        Usage(argv[0]);

    double factor, sum = 0.0;
    /* The index is long long to match n: an int index would overflow
     * (in i++ and in 2*i + 1) once n exceeds the int range. */
#pragma omp parallel for num_threads(thread_count) \
    reduction(+: sum) private(factor)
    for (long long i = 0; i < n; i++) {
        /* Even-indexed terms are positive, odd-indexed terms negative. */
        factor = (i % 2 == 0) ? 1.0 : -1.0;
        sum += factor/(2*i + 1);
#ifdef DEBUG
    printf("Thread %d > i = %lld, my_sum = %f\n", omp_get_thread_num(), i, sum);
#endif
    }

    sum = 4.0*sum;
    printf("With n = %lld terms and %d threads,\n", n, thread_count);
    printf("    Our estimate of pi = %.14f\n", sum);
    printf("                    pi = %.14f\n", 4.0*atan(1.0));

    return 0;
}
50 | 
51 | /*****************************************************************************
52 |  * Function:        Usage
53 |  * Purpose:         Print a message indicating how program should be started
54 |  *                  and terminate.
55 |  *****************************************************************************/
/*
 * Write the command-line synopsis and argument constraints to stderr,
 * then terminate the program.
 *   prog_name: name the program was invoked with (argv[0])
 * Does not return.
 */
void Usage(char* prog_name)
{
    fprintf(stderr, "Usage: %s <thread_count> <n>\n", prog_name);
    /* The remaining lines are fixed text, so no formatting is needed. */
    fputs("   thread_count is the number of threads >= 1\n", stderr);
    fputs("   n is the number of terms and should be >= 1\n", stderr);
    exit(EXIT_SUCCESS);
}


--------------------------------------------------------------------------------
/OpenMP/13_omp_private.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        13_omp_private.c
 3 |  * Purpose:     Print the value of a private variable at the beginning of
 4 |  *              a parallel block and after the end of the block
 5 |  * Compile:     gcc -Wall -fopenmp -o 13_omp_private 13_omp_private.c
 6 |  * Run:         ./13_omp_private <number of threads>
 7 |  * 
 8 |  * Input:       none
 9 |  * Output:      Value of int at various points in the program
10 |  *****************************************************************************/
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | #include <omp.h>
14 | 
/*
 * Entry point: shows the value of a variable listed in a private clause
 * at the start of a parallel block, after assignment inside the block,
 * and after the block ends.
 * Returns 1 if the thread count argument is missing or not positive.
 */
int main(int argc, char* argv[])
{
    int x = 5;

    /* argv[1] must exist before it is parsed. */
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
        return 1;
    }
    int thread_count = strtol(argv[1], NULL, 10);
    if (thread_count < 1) {
        fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
        return 1;
    }

#pragma omp parallel num_threads(thread_count) \
    private(x)
    {
        int my_rank = omp_get_thread_num();
        /* Deliberately reads the private copy before assigning it: this
         * print is the point of the demonstration. */
        printf("Thread %d > before initialization, x = %d\n", my_rank, x);
        x = 2*my_rank + 2;
        printf("Thread %d > after initialization, x = %d\n", my_rank, x);
    }
    printf("After parallel block, x = %d\n", x);

    return 0;
}


--------------------------------------------------------------------------------
/OpenMP/queue/queue.h:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        queue.h
 3 |  * Purpose:     Header file for queue.c which implements a queue of messages or
 4 |  *              pairs of ints (source + contents) as a linked list.
 5 |  *****************************************************************************/
 6 | #ifndef _QUEUE_H_
 7 | #define _QUEUE_H_
 8 | #ifdef USE_OMP_LOCK
 9 | #include <omp.h>
10 | #endif
11 | 
/* One node of the linked list backing the queue: a (source, contents)
 * pair of ints plus a link to another node. */
typedef struct queue_node_s {
    int src;                      /* "source" half of the pair */
    int msg;                      /* "contents" half of the pair */
    struct queue_node_s* next_p;  /* link to another node; presumably NULL at the end -- confirm in queue.c */
} QNode;
17 | 
/* Queue header: two counters and pointers to two list nodes, with an
 * optional OpenMP lock. */
typedef struct queue_s {
#ifdef USE_OMP_LOCK
    omp_lock_t lock;   /* only present when compiled with USE_OMP_LOCK */
#endif
    int enqueued;      /* presumably the running count of Enqueue calls -- confirm in queue.c */
    int dequeued;      /* presumably the running count of Dequeue calls -- confirm in queue.c */
    QNode* front_p;    /* node at the front of the list */
    QNode* tail_p;     /* node at the tail of the list */
} Queue;
27 | 
28 | Queue* Allocate_queue(void);
29 | void Free_queue(Queue* q);
30 | void Print_queue(Queue* q);
31 | void Enqueue(Queue* q, int src, int msg);
32 | int Dequeue(Queue* q, int* src, int* msg);
33 | int Search(Queue* q, int msg, int* src);
34 | 
35 | #endif


--------------------------------------------------------------------------------
/cblas_mat_mul.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        cblas_mat_mul.c
  3 |  * Purpose:     Compute a matrix-matrix product by using OpenBLAS library.
  4 |  * Compile:     g++ -Wall -o cblas_mat_mul cblas_mat_mul.c -lopenblas
  5 |  * Run:         ./cblas_mat_mul <m> <n> <k>
  6 |  *                  <m> : the rows of matrix A
  7 |  *                  <n> : the columns of matrix A and the rows of matrix B
  8 |  *                  <k> : the columns of matrix B
  9 |  * 
 10 |  * Input:       A, B
 11 |  * Output:      
 12 |  *              C: the product matrix, C = AB
 13 |  *              Elapsed time each multiplication and average elapsed time of
 14 |  *              100 multiplications
 15 |  *****************************************************************************/
 16 | 
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | #include <cblas.h>
 20 | #include <sys/time.h>
 21 | 
 22 | #define GET_TIME(now) { \
 23 |     struct timeval t; \
 24 |     gettimeofday(&t, NULL); \
 25 |     now = t.tv_sec + t.tv_usec/1000000.0; \
 26 | }
 27 | 
 28 | const int RMAX = 1000000;
 29 | #ifdef DEBUG
 30 | const int NCOUNT = 1; // number of multiplication
 31 | #else
 32 | const int NCOUNT = 100; // number of multiplication
 33 | #endif
 34 | 
 35 | void Get_args(int argc, char* argv[], int* m, int* n, int* k);
 36 | void Usage(char* prog_name);
 37 | void Generate_matrix(double mat[], int m, int n);
 38 | void Print_matrix(double mat[], int m, int n, char* title);
 39 | 
 40 | int main(int argc, char* argv[])
 41 | {
 42 |     int m, n, k;
 43 |     Get_args(argc, argv, &m, &n, &k);
 44 | 
 45 |     double *A, *B, *C;
 46 |     A = (double*)malloc(m * n * sizeof(double));
 47 |     B = (double*)malloc(n * k * sizeof(double));
 48 |     C = (double*)malloc(m * k * sizeof(double));
 49 | 
 50 |     Generate_matrix(A, m, n);
 51 |     Generate_matrix(B, n, k);
 52 | #ifdef DEBUG
 53 |     Print_matrix(A, m, n, "A");
 54 |     Print_matrix(B, n, k, "B");
 55 | #endif
 56 | 
 57 |     double start, finish, avg_elapsed = 0.0;
 58 |     for (int count = 0; count < NCOUNT; count++) {
 59 |         GET_TIME(start);
 60 |         cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, k, n, 1.0, A, n, B, k, 0, C, k);
 61 |         GET_TIME(finish);
 62 | 
 63 |         printf("[%3d] Elapsed time = %.6f seconds\n", count+1, finish-start);
 64 |         avg_elapsed += (finish - start) / NCOUNT;
 65 |     }
 66 |     
 67 | #ifdef DEBUG
 68 |     Print_matrix(C, m, k, "The product is");
 69 | #endif
 70 | 
 71 |     printf("Average elapsed time = %.6f seconds\n", avg_elapsed);
 72 | 
 73 |     free(A);
 74 |     free(B);
 75 |     free(C);
 76 | 
 77 |     return 0;
 78 | }
 79 | 
 80 | /*****************************************************************************
 81 |  * Function:        Get_args
 82 |  * Purpose:         Get and check command line arguments
 83 |  * In args:         argc, argv
 84 |  * Out args:        m, n, k
 85 |  *****************************************************************************/
 86 | void Get_args(int argc, char* argv[], int* m, int* n, int* k)
 87 | {
 88 |     if (argc != 4)
 89 |         Usage(argv[0]);
 90 |     
 91 |     *m = strtol(argv[1], NULL, 10);
 92 |     *n = strtol(argv[2], NULL, 10);
 93 |     *k = strtol(argv[3], NULL, 10);
 94 |     if (*m <= 0 || *n <= 0 || *k <= 0)
 95 |         Usage(argv[0]);
 96 | }
 97 | 
 98 | /*****************************************************************************
 99 |  * Function:        Usage
100 |  * Purpose:         Print a message indicating how program should be started
101 |  *                  and terminate.
102 |  * In arg:          prog_name
103 |  *****************************************************************************/
/*
 * Write the command-line synopsis to stderr and terminate the program.
 *   prog_name: name the program was invoked with (argv[0])
 * Does not return.
 */
void Usage(char* prog_name)
{
    FILE* const err = stderr;

    fprintf(err, "Usage: %s <m> <n> <k>\n", prog_name);
    exit(EXIT_SUCCESS);
}
109 | 
110 | /*****************************************************************************
111 |  * Function:        Generate_matrix
112 |  * Purpose:         Generate matrix entries by using the random number generator
113 |  * In args:         m, n
114 |  * Out arg:         mat
115 |  *****************************************************************************/
116 | void Generate_matrix(double mat[], int m, int n)
117 | {
118 |     for (int i = 0; i < m; i++)
119 |         for (int j = 0; j < n; j++)
120 |             mat[i*n + j] = (rand() % RMAX) / (RMAX / 10.0);
121 | }
122 | 
123 | /*****************************************************************************
124 |  * Function:        Print_matrix
125 |  * Purpose:         Print the matrix
126 |  * In args:         mat, m, n, title
127 |  *****************************************************************************/
/*
 * Print a title line followed by an m x n row-major matrix, one row per
 * output line with entries separated by a single space.
 *   mat:   matrix entries (m*n doubles)
 *   m, n:  number of rows and columns
 *   title: text printed on the first line
 */
void Print_matrix(double mat[], int m, int n, char* title)
{
    printf("%s\n", title);

    for (int row = 0; row < m; row++) {
        const double* entries = mat + row*n;
        int col = 0;
        while (col < n) {
            printf("%f ", entries[col]);
            col++;
        }
        printf("\n");
    }
}


--------------------------------------------------------------------------------
/mkl_mat_mul.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        mkl_mat_mul.c
  3 |  * Purpose:     Compute a matrix-matrix product by using Intel MKL library.
  4 |  * Compile:     gcc -Wall -o mkl_mat_mul mkl_mat_mul.c $(pkg-config mkl-static-lp64-iomp --libs --cflags)
  5 |  * Run:         ./mkl_mat_mul <m> <n> <k>
  6 |  *                  <m> : the rows of matrix A
  7 |  *                  <n> : the columns of matrix A and the rows of matrix B
  8 |  *                  <k> : the columns of matrix B
  9 |  * 
 10 |  * Input:       A, B
 11 |  * Output:      
 12 |  *              C: the product matrix, C = AB
 13 |  *              Elapsed time each multiplication and average elapsed time of
 14 |  *              100 multiplications
 15 |  *****************************************************************************/
 16 | 
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | #include <mkl/mkl.h>
 20 | #include <sys/time.h>
 21 | 
 22 | #define GET_TIME(now) { \
 23 |     struct timeval t; \
 24 |     gettimeofday(&t, NULL); \
 25 |     now = t.tv_sec + t.tv_usec/1000000.0; \
 26 | }
 27 | 
 28 | const int RMAX = 1000000;
 29 | #ifdef DEBUG
 30 | const int NCOUNT = 1; // number of multiplication
 31 | #else
 32 | const int NCOUNT = 100; // number of multiplication
 33 | #endif
 34 | 
 35 | void Get_args(int argc, char* argv[], int* m, int* n, int* k);
 36 | void Usage(char* prog_name);
 37 | void Generate_matrix(double mat[], int m, int n);
 38 | void Print_matrix(double mat[], int m, int n, char* title);
 39 | 
 40 | int main(int argc, char* argv[])
 41 | {
 42 |     int m, n, k;
 43 |     Get_args(argc, argv, &m, &n, &k);
 44 | 
 45 |     double *A, *B, *C;
 46 |     A = (double*)mkl_malloc(m * n * sizeof(double), 64);
 47 |     B = (double*)mkl_malloc(n * k * sizeof(double), 64);
 48 |     C = (double*)mkl_malloc(m * k * sizeof(double), 64);
 49 | 
 50 |     Generate_matrix(A, m, n);
 51 |     Generate_matrix(B, n, k);
 52 | #ifdef DEBUG
 53 |     Print_matrix(A, m, n, "A");
 54 |     Print_matrix(B, n, k, "B");
 55 | #endif
 56 | 
 57 | 
 58 |     double start, finish, avg_elapsed = 0.0;
 59 |     for (int count = 0; count < NCOUNT; count++) {
 60 |         GET_TIME(start);
 61 |         cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, k, n, 1.0, A, n, B, k, 0, C, k);
 62 |         GET_TIME(finish);
 63 | 
 64 |         printf("[%3d] Elapsed time = %.6f seconds\n", count+1, finish-start);
 65 |         avg_elapsed += (finish - start) / NCOUNT;
 66 |     }
 67 |     
 68 | #ifdef DEBUG
 69 |     Print_matrix(C, m, k, "The product is");
 70 | #endif
 71 | 
 72 |     printf("Average elapsed time = %.6f seconds\n", avg_elapsed);
 73 | 
 74 |     mkl_free(A);
 75 |     mkl_free(B);
 76 |     mkl_free(C);
 77 | 
 78 |     return 0;
 79 | }
 80 | 
 81 | /*****************************************************************************
 82 |  * Function:        Get_args
 83 |  * Purpose:         Get and check command line arguments
 84 |  * In args:         argc, argv
 85 |  * Out args:        m, n, k
 86 |  *****************************************************************************/
 87 | void Get_args(int argc, char* argv[], int* m, int* n, int* k)
 88 | {
 89 |     if (argc != 4)
 90 |         Usage(argv[0]);
 91 |     
 92 |     *m = strtol(argv[1], NULL, 10);
 93 |     *n = strtol(argv[2], NULL, 10);
 94 |     *k = strtol(argv[3], NULL, 10);
 95 |     if (*m <= 0 || *n <= 0 || *k <= 0)
 96 |         Usage(argv[0]);
 97 | }
 98 | 
 99 | /*****************************************************************************
100 |  * Function:        Usage
101 |  * Purpose:         Print a message indicating how program should be started
102 |  *                  and terminate.
103 |  * In arg:          prog_name
104 |  *****************************************************************************/
/*
 * Report the expected command line on stderr, then end the program.
 *   prog_name: name the program was invoked with (argv[0])
 * Does not return.
 */
void Usage(char* prog_name)
{
    const char* const name = prog_name;

    fprintf(stderr, "Usage: %s <m> <n> <k>\n", name);
    exit(EXIT_SUCCESS);
}
110 | 
111 | /*****************************************************************************
112 |  * Function:        Generate_matrix
113 |  * Purpose:         Generate matrix entries by using the random number generator
114 |  * In args:         m, n
115 |  * Out arg:         mat
116 |  *****************************************************************************/
117 | void Generate_matrix(double mat[], int m, int n)
118 | {
119 |     for (int i = 0; i < m; i++)
120 |         for (int j = 0; j < n; j++)
121 |             mat[i*n + j] = (rand() % RMAX) / (RMAX / 10.0);
122 | }
123 | 
124 | /*****************************************************************************
125 |  * Function:        Print_matrix
126 |  * Purpose:         Print the matrix
127 |  * In args:         mat, m, n, title
128 |  *****************************************************************************/
/*
 * Print `title` on its own line, then the m x n row-major matrix with
 * one row per line and a space after every entry.
 *   mat:   matrix entries (m*n doubles)
 *   m, n:  number of rows and columns
 *   title: heading printed before the matrix
 */
void Print_matrix(double mat[], int m, int n, char* title)
{
    int r, c;

    printf("%s\n", title);
    for (r = 0; r < m; r++) {
        for (c = 0; c < n; c++) {
            const double value = mat[r*n + c];
            printf("%f ", value);
        }
        printf("\n");
    }
}


--------------------------------------------------------------------------------
/mpi/00_mpi_hello.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <mpi.h>
 4 | 
 5 | const int MAX_STRING = 100;
 6 | 
 7 | int main(void)
 8 | {
 9 | 	char greeting[MAX_STRING];
10 | 	int comm_sz;
11 | 	int my_rank;
12 | 
13 | 	MPI_Init(NULL, NULL);
14 | 	MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
15 | 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
16 | 
17 | 	if (my_rank != 0) {
18 | 		sprintf(greeting, "Greetings from process %d of %d!", my_rank, comm_sz);
19 | 		MPI_Send(greeting, strlen(greeting) + 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
20 | 	}
21 | 	else {
22 | 		printf("Greetings from process %d of %d!\n", my_rank, comm_sz);
23 | 		for (int q = 1; q < comm_sz; q++) {
24 | 			MPI_Recv(greeting, MAX_STRING, MPI_CHAR, q, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
25 | 			printf("%s\n", greeting);
26 | 		}
27 | 	}
28 | 
29 | 	MPI_Finalize();
30 | 
31 | 	return 0;
32 | }
33 | 


--------------------------------------------------------------------------------
/mpi/01_serial_trap.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * File:        01_serial_trap.c
 3 |  * Purpose:     Calculate area by using trapezoidal rule
 4 |  * Compile:     gcc -Wall -o 01_serial_trap 01_serial_trap.c
 5 |  * Run:
 6 |  *  01_serial_trap <a> <b> <n>
 7 |  *      - a: left end-point
 8 |  *      - b: right end-point
 9 |  *      - n: the number of subinterval
10 | */
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | 
14 | double f(double x);
15 | double Trap(double a, double b, int n, double h);
16 | 
/*
 * Entry point: parses <a> <b> <n> from the command line and prints the
 * trapezoidal-rule estimate of the integral of f over [a, b].
 * Exits with -1 on a wrong argument count or a non-positive n.
 */
int main(int argc, char** argv)
{
    double integral;
    double a, b;
    int n;
    double h;

    if (argc != 4) {
        fprintf(stderr, "usage: %s <a> <b> <n>\n", argv[0]);
        fprintf(stderr, "   a: left end-point\n");
        fprintf(stderr, "   b: right end-point\n");
        fprintf(stderr, "   n: the number of subinterval\n");
        exit(-1);
    }

    a = strtod(argv[1], NULL);
    b = strtod(argv[2], NULL);
    n = (int)strtol(argv[3], NULL, 10);

    /* h = (b-a)/n is undefined for n == 0 and meaningless for n < 0. */
    if (n <= 0) {
        fprintf(stderr, "n must be a positive integer\n");
        exit(-1);
    }

    h = (b-a)/n;
    integral = Trap(a, b, n, h);

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %.15f\n", a, b, integral);

    return 0;
}
44 | 
/*
 * Composite trapezoidal rule on [a, b]:
 *     h * [ (f(a) + f(b))/2 + f(a+h) + ... + f(a+(n-1)h) ]
 *   a, b: interval endpoints
 *   n:    number of trapezoids
 *   h:    width of one trapezoid, (b-a)/n
 * Returns 0.0 when n <= 0 (no trapezoids to sum).
 */
double Trap(double a, double b, int n, double h)
{
    double integral;

    if (n <= 0)
        return 0.0;

    integral = (f(a) + f(b)) / 2.0;

    /* Interior points only: starting at k = 0 would add f(a) a second time. */
    for(int k = 1; k < n; k++) {
        integral += f(a + k*h);
    }
    integral = integral * h;

    return integral;
}
58 | 
/* Function being integrated: f(x) = x^2. */
double f(double x)
{
    double y = x;
    y *= x;
    return y;
}


--------------------------------------------------------------------------------
/mpi/02_mpi_trap1.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * File:        02_mpi_trap1.c
 3 |  * Purpose:     Use MPI to implement a parallel version of the trapezoidal rule.
 4 |  *              In this version the endpoints of the interval and
 5 |  *              the number of trapezoids are hardwired.
 6 |  * Compile:     mpicc -Wall -o 02_mpi_trap1 02_mpi_trap1.c
 7 |  * Run:         mpiexec -n <number of processes> ./02_mpi_trap1
 8 |  * 
 9 |  * Algorithm:
10 |  *    1.  Each process calculates "its" interval of
11 |  *        integration.
12 |  *    2.  Each process estimates the integral of f(x)
13 |  *        over its interval using the trapezoidal rule.
14 |  *    3a. Each process != 0 sends its integral to 0.
15 |  *    3b. Process 0 sums the calculations received from
16 |  *        the individual processes and prints the result.
17 |  */
18 | #include <stdio.h>
19 | #include <stdlib.h>
20 | #include <mpi.h>
21 | 
22 | double Trap(double a, double b, int n, double h);
23 | double f(double x);
24 | 
25 | int main(void)
26 | {
27 |     int my_rank, comm_sz, n = 1024, local_n;
28 |     double a = 0.0, b = 3.0, h, local_a, local_b, local_int, total_int;
29 | 
30 |     MPI_Init(NULL, NULL);
31 |     MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
32 |     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
33 | 
34 |     h = (b-a)/n;
35 |     local_n = n/comm_sz;
36 | 
37 |     local_a = a + my_rank*local_n*h;
38 |     local_b = local_a + local_n*h;
39 |     local_int = Trap(local_a, local_b, local_n, h);
40 | 
41 |     if (my_rank != 0) {
42 |         MPI_Send(&local_int, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
43 |     }
44 |     else {
45 |         total_int = local_int;
46 |         for (int source = 1; source < comm_sz; source++) {
47 |             MPI_Recv(&local_int, 1, MPI_DOUBLE, source, 0,
48 |                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
49 |             total_int += local_int;
50 |         }
51 |     }
52 | 
53 |     if (my_rank == 0) {
54 |         printf("With n = %d trapezoids, our estimate\n", n);
55 |         printf("of the integral from %f to %f = %.15f\n", a, b, total_int);
56 |     }
57 | 
58 |     MPI_Finalize();
59 | 
60 |     return 0;
61 | }
62 | 
/*
 * Composite trapezoidal rule on [a, b]:
 *     h * [ (f(a) + f(b))/2 + f(a+h) + ... + f(a+(n-1)h) ]
 *   a, b: endpoints of the (local) interval
 *   n:    number of trapezoids in that interval
 *   h:    width of one trapezoid
 * Returns 0.0 when n <= 0 (no trapezoids to sum).
 */
double Trap(double a, double b, int n, double h)
{
    double integral;

    if (n <= 0)
        return 0.0;

    integral = (f(a) + f(b)) / 2.0;

    /* Interior points only: starting at k = 0 would add f(a) a second time. */
    for(int k = 1; k < n; k++) {
        integral += f(a + k*h);
    }
    integral = integral * h;

    return integral;
}
76 | 
/* Integrand used by Trap: the square of x. */
double f(double x)
{
    const double result = x*x;
    return result;
}


--------------------------------------------------------------------------------
/mpi/03_mpi_output.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * File:        03_mpi_output.c
 3 |  * Purpose:     A program in which multiple MPI processes try to print a message.
 4 |  * Compile:     mpicc -Wall -o 03_mpi_output 03_mpi_output.c
 5 |  * Run:         mpiexec -n <number of processes> ./03_mpi_output
 6 |  */
 7 | #include <stdio.h>
 8 | #include <mpi.h>
 9 | 
10 | int main(void)
11 | {
12 |     int my_rank, comm_sz;
13 | 
14 |     MPI_Init(NULL, NULL);
15 |     MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
16 |     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
17 | 
18 |     printf("Proc %d of %d > Done anyone have a toothpick?\n", my_rank, comm_sz);
19 | 
20 |     MPI_Finalize();
21 | 
22 |     return 0;
23 | }


--------------------------------------------------------------------------------
/mpi/04_mpi_trap2.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * File:        04_mpi_trap2.c
 3 |  * Purpose:     Use MPI to implement a parallel version of the trapezoidal rule.
 4 |  *              This version accepts input of the endpoints of the interval and
 5 |  *              the number of trapezoids.
 6 |  * Compile:     mpicc -Wall -o 04_mpi_trap2 04_mpi_trap2.c
 7 |  * Run:         mpiexec -n <number of processes> ./04_mpi_trap2
 8 |  * 
 9 |  * Algorithm:
10 |  *    1.  Each process calculates "its" interval of
11 |  *        integration.
12 |  *    2.  Each process estimates the integral of f(x)
13 |  *        over its interval using the trapezoidal rule.
14 |  *    3a. Each process != 0 sends its integral to 0.
15 |  *    3b. Process 0 sums the calculations received from
16 |  *        the individual processes and prints the result.
17 |  */
18 | #include <stdio.h>
19 | #include <stdlib.h>
20 | #include <mpi.h>
21 | 
22 | double Trap(double a, double b, int n, double h);
23 | double f(double x);
24 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n);
25 | 
26 | int main(void)
27 | {
28 |     int my_rank, comm_sz, n, local_n;
29 |     double a, b, h, local_a, local_b, local_int, total_int;
30 | 
31 |     MPI_Init(NULL, NULL);
32 |     MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
33 |     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
34 | 
35 |     Get_input(my_rank, comm_sz, &a, &b, &n);
36 | 
37 |     h = (b-a)/n;
38 |     local_n = n/comm_sz;
39 | 
40 |     local_a = a + my_rank*local_n*h;
41 |     local_b = local_a + local_n*h;
42 |     local_int = Trap(local_a, local_b, local_n, h);
43 | 
44 |     MPI_Reduce(&local_int, &total_int, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
45 | 
46 |     if (my_rank == 0) {
47 |         printf("With n = %d trapezoids, our estimate\n", n);
48 |         printf("of the integral from %f to %f = %.15f\n", a, b, total_int);
49 |     }
50 | 
51 |     MPI_Finalize();
52 | 
53 |     return 0;
54 | }
55 | 
/*
 * Composite trapezoidal rule on [a, b]:
 *     h * [ (f(a) + f(b))/2 + f(a+h) + ... + f(a+(n-1)h) ]
 *   a, b: endpoints of the (local) interval
 *   n:    number of trapezoids in that interval
 *   h:    width of one trapezoid
 * Returns 0.0 when n <= 0 (no trapezoids to sum).
 */
double Trap(double a, double b, int n, double h)
{
    double integral;

    if (n <= 0)
        return 0.0;

    integral = (f(a) + f(b)) / 2.0;

    /* Interior points only: starting at k = 0 would add f(a) a second time. */
    for(int k = 1; k < n; k++) {
        integral += f(a + k*h);
    }
    integral = integral * h;

    return integral;
}
69 | 
/* Integrand: maps x to x*x. */
double f(double x)
{
    double value;

    value = x*x;
    return value;
}
74 | 
75 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n)
76 | {
77 |     int dest;
78 | 
79 |     if (my_rank == 0) {
80 |         printf("Enter a, b, and n\n");
81 |         scanf("%lf %lf %d", p_a, p_b, p_n);
82 |         for (dest = 1; dest < comm_sz; dest++) {
83 |             MPI_Send(p_a, 1, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD);
84 |             MPI_Send(p_b, 1, MPI_DOUBLE, dest, 0, MPI_COMM_WORLD);
85 |             MPI_Send(p_n, 1, MPI_INT, dest, 0, MPI_COMM_WORLD);
86 |         } 
87 |     } 
88 |     else { /* my_rank != 0 */
89 |         MPI_Recv(p_a, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD,
90 |             MPI_STATUS_IGNORE);
91 |         MPI_Recv(p_b, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD,
92 |             MPI_STATUS_IGNORE);
93 |         MPI_Recv(p_n, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
94 |             MPI_STATUS_IGNORE);
95 |     } 
96 | }


--------------------------------------------------------------------------------
/mpi/05_mpi_trap3.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * File:        05_mpi_trap3.c
 3 |  * Purpose:     Use MPI to implement a parallel version of the trapezoidal rule.
 4 |  *              This version uses collective communications to distribute
 5 |  *              the input data and compute the global sum.
 6 |  * Compile:     mpicc -Wall -o 05_mpi_trap3 05_mpi_trap3.c
 7 |  * Run:         mpiexec -n <number of processes> ./05_mpi_trap3
 8 |  * 
 9 |  * Algorithm:
10 |  *    1.  Each process calculates "its" interval of
11 |  *        integration.
12 |  *    2.  Each process estimates the integral of f(x)
13 |  *        over its interval using the trapezoidal rule.
14 |  *    3a. Each process != 0 sends its integral to 0.
15 |  *    3b. Process 0 sums the calculations received from
16 |  *        the individual processes and prints the result.
17 |  */
18 | #include <stdio.h>
19 | #include <stdlib.h>
20 | #include <mpi.h>
21 | 
22 | double Trap(double a, double b, int n, double h);
23 | double f(double x);
24 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n);
25 | 
26 | int main(void)
27 | {
28 |     int my_rank, comm_sz, n, local_n;
29 |     double a, b, h, local_a, local_b, local_int, total_int;
30 | 
31 |     MPI_Init(NULL, NULL);
32 |     MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
33 |     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
34 | 
35 |     Get_input(my_rank, comm_sz, &a, &b, &n);
36 | 
37 |     h = (b-a)/n;
38 |     local_n = n/comm_sz;
39 | 
40 |     local_a = a + my_rank*local_n*h;
41 |     local_b = local_a + local_n*h;
42 |     local_int = Trap(local_a, local_b, local_n, h);
43 | 
44 |     MPI_Reduce(&local_int, &total_int, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
45 | 
46 |     if (my_rank == 0) {
47 |         printf("With n = %d trapezoids, our estimate\n", n);
48 |         printf("of the integral from %f to %f = %.15f\n", a, b, total_int);
49 |     }
50 | 
51 |     MPI_Finalize();
52 | 
53 |     return 0;
54 | }
55 | 
/*
 * Function:    f
 * Purpose:     The integrand, f(x) = x^2.
 */
double f(double x)
{
    return x*x;
}

/*
 * Function:    Trap
 * Purpose:     Estimate the integral of f over [a, b] with the composite
 *              trapezoidal rule.
 * In args:     a, b : left and right endpoints of the interval
 *              n    : number of trapezoids
 *              h    : base length of each trapezoid
 * Return:      h * [ (f(a) + f(b))/2 + f(a+h) + ... + f(a+(n-1)h) ]
 */
double Trap(double a, double b, int n, double h)
{
    /* The two endpoints each contribute half a term. */
    double integral = (f(a) + f(b)) / 2.0;

    /* Interior points only: starting at k = 0 would add f(a) a second time. */
    for (int k = 1; k < n; k++) {
        integral += f(a + k*h);
    }

    return integral * h;
}
74 | 
75 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n)
76 | {
77 |     if (my_rank == 0) {
78 |         printf("Enter a, b, and n\n");
79 |         scanf("%lf %lf %d", p_a, p_b, p_n);
80 |     }
81 | 
82 |     MPI_Bcast(p_a, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
83 |     MPI_Bcast(p_b, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
84 |     MPI_Bcast(p_n, 1, MPI_INT, 0, MPI_COMM_WORLD);
85 | }


--------------------------------------------------------------------------------
/mpi/06_mpi_trap4.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * File:        06_mpi_trap4.c
  3 |  * Purpose:     Use MPI to implement a parallel version of the trapezoidal rule.
  4 |  *              This version uses collective communications and 
  5 |  *              MPI derived datatypes to distribute the input data and
  6 |  *              compute the global sum.
  7 |  * Compile:     mpicc -Wall -o 06_mpi_trap4 06_mpi_trap4.c
  8 |  * Run:         mpiexec -n <number of processes> ./06_mpi_trap4
  9 |  * 
 10 |  * Algorithm:
 11 |  *    1.  Each process calculates "its" interval of
 12 |  *        integration.
 13 |  *    2.  Each process estimates the integral of f(x)
 14 |  *        over its interval using the trapezoidal rule.
 15 |  *    3a. All processes combine their integrals with MPI_Reduce,
 16 |  *        which sums them onto process 0.
 17 |  *    3b. Process 0 prints the result.
 18 |  */
 19 | #include <stdio.h>
 20 | #include <stdlib.h>
 21 | #include <mpi.h>
 22 | 
 23 | double Trap(double a, double b, int n, double h);
 24 | double f(double x);
 25 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n);
 26 | void Build_mpi_type(double* p_a, double* p_b, int* p_n, MPI_Datatype* p_input_mpi_t);
 27 | 
 28 | int main(void)
 29 | {
 30 |     int my_rank, comm_sz, n, local_n;
 31 |     double a, b, h, local_a, local_b, local_int, total_int;
 32 | 
 33 |     MPI_Init(NULL, NULL);
 34 |     MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
 35 |     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 36 | 
 37 |     Get_input(my_rank, comm_sz, &a, &b, &n);
 38 | 
 39 |     h = (b-a)/n;
 40 |     local_n = n/comm_sz;
 41 | 
 42 |     local_a = a + my_rank*local_n*h;
 43 |     local_b = local_a + local_n*h;
 44 |     local_int = Trap(local_a, local_b, local_n, h);
 45 | 
 46 |     MPI_Reduce(&local_int, &total_int, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
 47 | 
 48 |     if (my_rank == 0) {
 49 |         printf("With n = %d trapezoids, our estimate\n", n);
 50 |         printf("of the integral from %f to %f = %.15f\n", a, b, total_int);
 51 |     }
 52 | 
 53 |     MPI_Finalize();
 54 | 
 55 |     return 0;
 56 | }
 57 | 
 58 | double Trap(double a, double b, int n, double h)
 59 | {
 60 |     double integral;
 61 |     
 62 |     integral = (f(a) + f(b)) / 2.0;
 63 | 
 64 |     for(int k = 0; k < n; k++) {
 65 |         integral += f(a + k*h);
 66 |     }
 67 |     integral = integral * h;
 68 | 
 69 |     return integral;
 70 | }
 71 | 
 72 | double f(double x)
 73 | {
 74 |     return x*x;
 75 | }
 76 | 
 77 | void Get_input(int my_rank, int comm_sz, double* p_a, double* p_b, int* p_n)
 78 | {
 79 |     MPI_Datatype input_mpi_t;
 80 | 
 81 |     Build_mpi_type(p_a, p_b, p_n, &input_mpi_t);
 82 | 
 83 |     if (my_rank == 0) {
 84 |         printf("Enter a, b, and n\n");
 85 |         scanf("%lf %lf %d", p_a, p_b, p_n);
 86 |     }
 87 | 
 88 |     MPI_Bcast(p_a, 1, input_mpi_t, 0, MPI_COMM_WORLD);
 89 | 
 90 |     MPI_Type_free(&input_mpi_t);
 91 | }
 92 | 
 93 | void Build_mpi_type(double* p_a, double* p_b, int* p_n, MPI_Datatype* p_input_mpi_t)
 94 | {
 95 |     int array_of_blocklengths[3] = {1, 1, 1};
 96 |     MPI_Datatype array_of_types[3] = {MPI_DOUBLE, MPI_DOUBLE, MPI_INT};
 97 |     MPI_Aint a_addr, b_addr, n_addr;
 98 |     MPI_Aint array_of_displacements[3] = {0};
 99 | 
100 |     MPI_Get_address(p_a, &a_addr);
101 |     MPI_Get_address(p_b, &b_addr);
102 |     MPI_Get_address(p_n, &n_addr);
103 | 
104 |     array_of_displacements[1] = b_addr - a_addr;
105 |     array_of_displacements[2] = n_addr - a_addr;
106 |     
107 |     MPI_Type_create_struct(3, array_of_blocklengths, array_of_displacements,
108 |                             array_of_types, p_input_mpi_t);
109 |     MPI_Type_commit(p_input_mpi_t);
110 | }


--------------------------------------------------------------------------------
/ocv_mat_mul.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        ocv_mat_mul.c
  3 |  * Purpose:     Compute a matrix-matrix product by using OpenCV library.
  4 |  * Compile:     g++ -Wall -o ocv_mat_mul ocv_mat_mul.c $(pkg-config opencv4 --libs --cflags)
  5 |  * Run:         ./ocv_mat_mul <m> <n> <k>
  6 |  *                  <m> : the rows of matrix A
  7 |  *                  <n> : the columns of matrix A and the rows of matrix B
  8 |  *                  <k> : the columns of matrix B
  9 |  * 
 10 |  * Input:       A, B
 11 |  * Output:      
 12 |  *              C: the product matrix, C = AB
 13 |  *              Elapsed time each multiplication and average elapsed time of
 14 |  *              100 multiplications
 15 |  *****************************************************************************/
 16 | #include <stdio.h>
 17 | #include <stdlib.h>
 18 | #include <opencv2/highgui.hpp>
 19 | #include <sys/time.h>
 20 | 
 21 | #define GET_TIME(now) { \
 22 |     struct timeval t; \
 23 |     gettimeofday(&t, NULL); \
 24 |     now = t.tv_sec + t.tv_usec/1000000.0; \
 25 | }
 26 | 
 27 | const int RMAX = 1000000;
 28 | #ifdef DEBUG
 29 | const int NCOUNT = 1; // number of multiplication
 30 | #else
 31 | const int NCOUNT = 100; // number of multiplication
 32 | #endif
 33 | 
 34 | void Get_args(int argc, char* argv[], int* m, int* n, int* k);
 35 | void Usage(char* prog_name);
 36 | void Generate_matrix(double mat[], int m, int n);
 37 | void Print_matrix(double mat[], int m, int n, char* title);
 38 | 
 39 | int main(int argc, char* argv[])
 40 | {
 41 |     int m, n, k;
 42 |     Get_args(argc, argv, &m, &n, &k);
 43 | 
 44 |     double *A, *B, *C;
 45 |     A = (double*)malloc(m * n * sizeof(double));
 46 |     B = (double*)malloc(n * k * sizeof(double));
 47 |     //C = (double*)malloc(m * k * sizeof(double));
 48 | 
 49 |     Generate_matrix(A, m, n);
 50 |     Generate_matrix(B, n, k);
 51 | #ifdef DEBUG
 52 |     Print_matrix(A, m, n, "A");
 53 |     Print_matrix(B, n, k, "B");
 54 | #endif
 55 | 
 56 | 
 57 |     double start, finish, avg_elapsed = 0.0;
 58 |     cv::Mat cvC;
 59 |     for (int count = 0; count < NCOUNT; count++) {
 60 |         GET_TIME(start);
 61 |         cv::Mat cvA(m, n, CV_64FC1, A);
 62 |         cv::Mat cvB(n, k, CV_64FC1, B);
 63 |         cvC = cvA * cvB;
 64 |         //cv::gemm(cvA, cvB, 1.0, NULL, 0, cvC);
 65 |         //C = reinterpret_cast<double*>(cvC.data);
 66 |         GET_TIME(finish);
 67 | 
 68 |         printf("[%3d] Elapsed time = %.6f seconds\n", count+1, finish-start);
 69 |         avg_elapsed += (finish - start) / NCOUNT;
 70 |     }
 71 |     
 72 | #ifdef DEBUG
 73 |     printf("The product is\n");
 74 |     cv::print(cvC);
 75 |     printf("\n\n");
 76 | #endif
 77 | 
 78 |     printf("Average elapsed time = %.6f seconds\n", avg_elapsed);
 79 | 
 80 |     free(A);
 81 |     free(B);
 82 |     //free(C);
 83 | 
 84 |     return 0;
 85 | }
 86 | 
 87 | /*****************************************************************************
 88 |  * Function:        Get_args
 89 |  * Purpose:         Get and check command list arguments
 90 |  * In args:         argc, argv
 91 |  * Out args:        m, n, k
 92 |  *****************************************************************************/
 93 | void Get_args(int argc, char* argv[], int* m, int* n, int* k)
 94 | {
 95 |     if (argc != 4)
 96 |         Usage(argv[0]);
 97 |     
 98 |     *m = strtol(argv[1], NULL, 10);
 99 |     *n = strtol(argv[2], NULL, 10);
100 |     *k = strtol(argv[3], NULL, 10);
101 |     if (*m <= 0 || *n <= 0 || *k <= 0)
102 |         Usage(argv[0]);
103 | }
104 | 
105 | /*****************************************************************************
106 |  * Function:        Usage
107 |  * Purpose:         Print a message indicating how program should be started
108 |  *                  and terminate.
109 |  * In arg:          prog_name
110 |  *****************************************************************************/
111 | void Usage(char* prog_name)
112 | {
113 |     fprintf(stderr, "Usage: %s <m> <n> <k>\n", prog_name);
114 |     exit(0);
115 | }
116 | 
117 | /*****************************************************************************
118 |  * Function:        Generate_matrix
119 |  * Purpose:         Generate matrix entries by using the random number generator
120 |  * In args:         m, n
121 |  * Out arg:         mat
122 |  *****************************************************************************/
123 | void Generate_matrix(double mat[], int m, int n)
124 | {
125 |     for (int i = 0; i < m; i++)
126 |         for (int j = 0; j < n; j++)
127 |             mat[i*n + j] = (rand() % RMAX) / (RMAX / 10.0);
128 | }
129 | 
/*****************************************************************************
 * Function:        Print_matrix
 * Purpose:         Print the title on its own line, then the m x n matrix
 *                  with one row per output line.
 * In args:         mat, m, n, title
 *****************************************************************************/
void Print_matrix(double mat[], int m, int n, char* title)
{
    const double* cell = mat;

    printf("%s\n", title);
    for (int row = 0; row < m; row++) {
        /* Each entry is followed by a single space, including the last. */
        for (int col = 0; col < n; col++) {
            printf("%f ", *cell++);
        }
        printf("\n");
    }
}


--------------------------------------------------------------------------------
/pthread/00_pth_hello.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        00_pth_hello.c
 3 |  * Purpose:     Illustrate basic use of threads: create some threads,
 4 |  *              each of which prints a message.
 5 |  * Compile:     gcc -Wall -o 00_pth_hello 00_pth_hello.c -pthread
 6 |  * Run:         ./00_pth_hello <thread_count>
 7 |  * 
 8 |  * Input:       none
 9 |  * Output:      message from each thread
10 |  *****************************************************************************/
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | #include <pthread.h>
14 | 
15 | const int MAX_THREADS = 64;
16 | 
17 | /* global variables: accessible to all threads */
18 | int thread_count;
19 | 
20 | void Usage(char* prog_name);
21 | void* Hello(void* rank);
22 | 
23 | int main(int argc, char* argv[])
24 | {
25 |     if (argc != 2) {
26 |         Usage(argv[0]);
27 |     }
28 | 
29 |     /* Get number of threads from command line */
30 |     thread_count = strtol(argv[1], NULL, 10);
31 |     if (thread_count <= 0 || thread_count > MAX_THREADS) {
32 |         Usage(argv[0]);
33 |     }
34 | 
35 |     pthread_t* thread_handles;
36 |     thread_handles = malloc(thread_count*sizeof(pthread_t));
37 | 
38 |     for (long thread = 0; thread < thread_count; thread++) {
39 |         pthread_create(&thread_handles[thread], NULL, Hello, (void*)thread);
40 |     }
41 | 
42 |     printf("Hello from the main thread\n");
43 | 
44 |     for (long thread = 0; thread < thread_count; thread++) {
45 |         pthread_join(thread_handles[thread], NULL);
46 |     }
47 | 
48 |     free(thread_handles);
49 | 
50 |     return 0;
51 | }
52 | 
53 | void Usage(char* prog_name)
54 | {
55 |     fprintf(stderr, "Usage %s <number of threads>\n", prog_name);
56 |     fprintf(stderr, "0 < number of threads <= %d\n", MAX_THREADS);
57 |     exit(0);
58 | }
59 | 
60 | void* Hello(void* rank)
61 | {
62 |     long my_rank = (long)rank;
63 | 
64 |     printf("Hello from thread %ld of %d\n", my_rank, thread_count);
65 | 
66 |     return NULL;
67 | }


--------------------------------------------------------------------------------
/pthread/02_pth_pi.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        02_pth_pi.c
  3 |  * Purpose:     Estimate pi using series
  4 |  * 
  5 |  *                  pi = 4*[1 - 1/3 + 1/5 - 1/7 + 1/9 - . . .]
  6 |  *              
  7 |  *              This version has a 'very serious bug'
  8 |  * 
  9 |  * Compile:     gcc -Wall -o 02_pth_pi 02_pth_pi.c -pthread [-lm]
 10 |  * Run:         ./02_pth_pi <number of threads> <n>
 11 |  *              <n>:the number of terms of the Maclaurin series. It should be
 12 |  *                  evenly divisible by the number of threads
 13 |  * 
 14 |  * Input:       none
 15 |  * Output:      The estimate of pi using multiple threads, one thread, and the
 16 |  *              value computed by the math library arctan function.
 17 |  *              Also elapsed times for the multithreaded and singlethreaded 
 18 |  *              computations.
 19 |  *****************************************************************************/
 20 | #include <stdio.h>
 21 | #include <stdlib.h>
 22 | #include <math.h>
 23 | #include <sys/time.h>
 24 | #include <pthread.h>
 25 | 
 26 | #define GET_TIME(now) { \
 27 |     struct timeval t; \
 28 |     gettimeofday(&t, NULL); \
 29 |     now = t.tv_sec + t.tv_usec/1000000.0; \
 30 | }
 31 | 
 32 | const int MAX_THREADS = 1024;
 33 | 
 34 | /* global variables */
 35 | long thread_count;
 36 | long long n;
 37 | double sum;
 38 | 
 39 | void *Thread_sum(void* rank);
 40 | 
 41 | void Get_args(int argc, char* argv[]);
 42 | double Serial_pi(long long n);
 43 | 
 44 | int main(int argc, char* argv[])
 45 | {
 46 |     pthread_t* thread_handles;
 47 | 
 48 |     Get_args(argc, argv);
 49 | 
 50 |     thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
 51 |     sum = 0.0;
 52 | 
 53 |     for (long thread = 0; thread < thread_count; thread++)
 54 |         pthread_create(&thread_handles[thread], NULL, Thread_sum, (void*)thread);
 55 |     
 56 |     for (long thread = 0; thread < thread_count; thread++)
 57 |         pthread_join(thread_handles[thread], NULL);
 58 | 
 59 |     sum *= 4.0;
 60 | 
 61 |     printf("With n = %lld terms,\n", n);
 62 |     printf("   Multi-threaded estimate of pi  = %.15f\n", sum);
 63 | 
 64 |     sum = Serial_pi(n);
 65 |     printf("   Single-threaded estimate of pi = %.15f\n", sum);
 66 |     printf("   Math library estimate of pi    = %.15f\n", 4.0*atan(1.0));
 67 | 
 68 |     free(thread_handles);
 69 | 
 70 |     return 0;
 71 | }
 72 | 
 73 | /*****************************************************************************
 74 |  * Function:        Thread_sum
 75 |  * Purpose:         Add in the terms computed by the thread running this
 76 |  * In args:         rank
 77 |  * Return:          ignored(NULL)
 78 |  * Globals in:      n, thread_count
 79 |  * Global in/out:   sum
 80 |  *****************************************************************************/
 81 | void* Thread_sum(void* rank)
 82 | {
 83 |     long my_rank = (long)rank;
 84 |     long long my_n = n / thread_count;
 85 |     long long my_first_i = my_n * my_rank;
 86 |     long long my_last_i = my_first_i + my_n;
 87 | 
 88 |     double factor;
 89 |     if (my_first_i % 2 == 0)
 90 |         factor = 1.0;
 91 |     else
 92 |         factor = -1.0;
 93 | 
 94 |     for (long long i = my_first_i; i < my_last_i; i++, factor = -factor)
 95 |         sum += factor/(2*i + 1);
 96 | 
 97 |     return NULL;
 98 | }
 99 | 
100 | /*****************************************************************************
101 |  * Function:        Get_args
102 |  * Purpose:         Get and check command list arguments
103 |  * In args:         argc, argv
104 |  * Globals out:     thread_count, n
105 |  *****************************************************************************/
106 | void Get_args(int argc, char* argv[])
107 | {
108 |     int ok = 1;
109 |     if (argc == 3) {
110 |         thread_count = strtol(argv[1], NULL, 10);
111 |         if (thread_count < 0 || thread_count > MAX_THREADS)
112 |             ok = 0;
113 |         
114 |         n = strtoll(argv[2], NULL, 10);
115 |         if (n <= 0)
116 |             ok = 0;
117 |     }
118 |     else
119 |         ok = 0;
120 |     
121 |     if (ok == 0) {
122 |         fprintf(stderr, "Usage: %s <number of threads> <n>\n", argv[0]);
123 |         fprintf(stderr, "   n is the number of terms and should be >= 1\n");
124 |         fprintf(stderr, "   n should be evenly divisible by the number of threads\n");
125 |         exit(0);
126 |     }
127 | }
128 | 
/*****************************************************************************
 * Function:        Serial_pi
 * Purpose:         Estimate pi on the calling thread only
 * In args:         n
 * Return:          4 * [1 - 1/3 + 1/5 - ...], summed over the first n terms
 *****************************************************************************/
double Serial_pi(long long n)
{
    double acc = 0.0;
    double sign = 1.0;
    long long i = 0;

    /* Term i is sign/(2i + 1); the sign alternates starting at +1 */
    while (i < n) {
        acc += sign / (2*i + 1);
        sign = -sign;
        i++;
    }

    return 4.0 * acc;
}


--------------------------------------------------------------------------------
/pthread/03_pth_pi_busy1.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        03_pth_pi_busy1.c
  3 |  * Purpose:     Estimate pi using series
  4 |  * 
  5 |  *                  pi = 4*[1 - 1/3 + 1/5 - 1/7 + 1/9 - . . .]
  6 |  *              
  7 |  *              This version uses busy-waiting to control access to the
  8 |  *              critical section.
  9 |  * 
 10 |  * Compile:     gcc -Wall -o 03_pth_pi_busy1 03_pth_pi_busy1.c -pthread [-lm]
 11 |  * Run:         ./03_pth_pi_busy1 <number of threads> <n>
 12 |  *              <n>:the number of terms of the Maclaurin series. It should be
 13 |  *                  evenly divisible by the number of threads
 14 |  * 
 15 |  * Input:       none
 16 |  * Output:      The estimate of pi using multiple threads, one thread, and the
 17 |  *              value computed by the math library arctan function.
 18 |  *              Also elapsed times for the multithreaded and singlethreaded 
 19 |  *              computations.
 20 |  *****************************************************************************/
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <math.h>
 24 | #include <sys/time.h>
 25 | #include <pthread.h>
 26 | 
 27 | #define GET_TIME(now) { \
 28 |     struct timeval t; \
 29 |     gettimeofday(&t, NULL); \
 30 |     now = t.tv_sec + t.tv_usec/1000000.0; \
 31 | }
 32 | 
 33 | const int MAX_THREADS = 1024;
 34 | 
 35 | /* global variables */
 36 | long thread_count;
 37 | long long n;
 38 | double sum;
 39 | int flag;
 40 | 
 41 | void *Thread_sum(void* rank);
 42 | 
 43 | void Get_args(int argc, char* argv[]);
 44 | double Serial_pi(long long n);
 45 | 
 46 | int main(int argc, char* argv[])
 47 | {
 48 |     pthread_t* thread_handles;
 49 | 
 50 |     Get_args(argc, argv);
 51 | 
 52 |     thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
 53 | 
 54 |     double start, finish;
 55 |     GET_TIME(start);
 56 |     sum = 0.0;
 57 |     flag = 0;
 58 |     for (long thread = 0; thread < thread_count; thread++)
 59 |         pthread_create(&thread_handles[thread], NULL, Thread_sum, (void*)thread);
 60 |     
 61 |     for (long thread = 0; thread < thread_count; thread++)
 62 |         pthread_join(thread_handles[thread], NULL);
 63 |     sum *= 4.0;
 64 |     GET_TIME(finish);
 65 | 
 66 |     printf("With n = %lld terms,\n", n);
 67 |     printf("   Multi-threaded estimate of pi  = %.15f\n", sum);
 68 |     printf("   Elapsed time = %f seconds\n", finish-start);
 69 | 
 70 |     GET_TIME(start);
 71 |     sum = Serial_pi(n);
 72 |     GET_TIME(finish);
 73 |     printf("   Single-threaded estimate of pi = %.15f\n", sum);
 74 |     printf("   Elapsed time = %f seconds\n", finish-start);
 75 |     printf("   Math library estimate of pi    = %.15f\n", 4.0*atan(1.0));
 76 | 
 77 |     free(thread_handles);
 78 | 
 79 |     return 0;
 80 | }
 81 | 
 82 | /*****************************************************************************
 83 |  * Function:        Thread_sum
 84 |  * Purpose:         Add in the terms computed by the thread running this
 85 |  * In args:         rank
 86 |  * Return:          ignored(NULL)
 87 |  * Globals in:      n, thread_count
 88 |  * Global in/out:   sum
 89 |  *****************************************************************************/
 90 | void* Thread_sum(void* rank)
 91 | {
 92 |     long my_rank = (long)rank;
 93 |     long long my_n = n / thread_count;
 94 |     long long my_first_i = my_n * my_rank;
 95 |     long long my_last_i = my_first_i + my_n;
 96 | 
 97 |     double factor;
 98 |     if (my_first_i % 2 == 0)
 99 |         factor = 1.0;
100 |     else
101 |         factor = -1.0;
102 | 
103 |     for (long long i = my_first_i; i < my_last_i; i++, factor = -factor) {
104 |         while (flag != my_rank);
105 |         sum += factor/(2*i + 1);
106 |         flag = (flag + 1) % thread_count;
107 |     }
108 | 
109 |     return NULL;
110 | }
111 | 
112 | /*****************************************************************************
113 |  * Function:        Get_args
114 |  * Purpose:         Get and check command list arguments
115 |  * In args:         argc, argv
116 |  * Globals out:     thread_count, n
117 |  *****************************************************************************/
118 | void Get_args(int argc, char* argv[])
119 | {
120 |     int ok = 1;
121 |     if (argc == 3) {
122 |         thread_count = strtol(argv[1], NULL, 10);
123 |         if (thread_count < 0 || thread_count > MAX_THREADS)
124 |             ok = 0;
125 |         
126 |         n = strtoll(argv[2], NULL, 10);
127 |         if (n <= 0)
128 |             ok = 0;
129 |     }
130 |     else
131 |         ok = 0;
132 |     
133 |     if (ok == 0) {
134 |         fprintf(stderr, "Usage: %s <number of threads> <n>\n", argv[0]);
135 |         fprintf(stderr, "   n is the number of terms and should be >= 1\n");
136 |         fprintf(stderr, "   n should be evenly divisible by the number of threads\n");
137 |         exit(0);
138 |     }
139 | }
140 | 
/*****************************************************************************
 * Function:        Serial_pi
 * Purpose:         Estimate pi on the calling thread only
 * In args:         n
 * Return:          4 * [1 - 1/3 + 1/5 - ...], summed over the first n terms
 *****************************************************************************/
double Serial_pi(long long n)
{
    double partial = 0.0;
    double sign = 1.0;
    long long term = 0;

    /* Each term is sign/(2*term + 1); the sign flips after every term */
    while (term < n) {
        partial += sign / (2*term + 1);
        sign = -sign;
        term++;
    }

    return 4.0 * partial;
}


--------------------------------------------------------------------------------
/pthread/04_pth_pi_busy2.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        04_pth_pi_busy2.c
  3 |  * Purpose:     Estimate pi using series
  4 |  * 
  5 |  *                  pi = 4*[1 - 1/3 + 1/5 - 1/7 + 1/9 - . . .]
  6 |  *              
  7 |  *              This is the second version that uses busy-waiting.
  8 |  *              The critical section now follows the main loop.
  9 |  * 
 10 |  * Compile:     gcc -Wall -o 04_pth_pi_busy2 04_pth_pi_busy2.c -pthread [-lm]
 11 |  * Run:         ./04_pth_pi_busy2 <number of threads> <n>
 12 |  *              <n>:the number of terms of the Maclaurin series. It should be
 13 |  *                  evenly divisible by the number of threads
 14 |  * 
 15 |  * Input:       none
 16 |  * Output:      The estimate of pi using multiple threads, one thread, and the
 17 |  *              value computed by the math library arctan function.
 18 |  *              Also elapsed times for the multithreaded and singlethreaded 
 19 |  *              computations.
 20 |  *****************************************************************************/
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <math.h>
 24 | #include <sys/time.h>
 25 | #include <pthread.h>
 26 | 
 27 | #define GET_TIME(now) { \
 28 |     struct timeval t; \
 29 |     gettimeofday(&t, NULL); \
 30 |     now = t.tv_sec + t.tv_usec/1000000.0; \
 31 | }
 32 | 
 33 | const int MAX_THREADS = 1024;
 34 | 
 35 | /* global variables */
 36 | long thread_count;
 37 | long long n;
 38 | double sum;
 39 | int flag;
 40 | 
 41 | void *Thread_sum(void* rank);
 42 | 
 43 | void Get_args(int argc, char* argv[]);
 44 | double Serial_pi(long long n);
 45 | 
 46 | int main(int argc, char* argv[])
 47 | {
 48 |     pthread_t* thread_handles;
 49 | 
 50 |     Get_args(argc, argv);
 51 | 
 52 |     thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
 53 | 
 54 |     double start, finish;
 55 |     GET_TIME(start);
 56 |     sum = 0.0;
 57 |     flag = 0;
 58 |     for (long thread = 0; thread < thread_count; thread++)
 59 |         pthread_create(&thread_handles[thread], NULL, Thread_sum, (void*)thread);
 60 |     
 61 |     for (long thread = 0; thread < thread_count; thread++)
 62 |         pthread_join(thread_handles[thread], NULL);
 63 |     sum *= 4.0;
 64 |     GET_TIME(finish);
 65 | 
 66 |     printf("With n = %lld terms,\n", n);
 67 |     printf("   Multi-threaded estimate of pi  = %.15f\n", sum);
 68 |     printf("   Elapsed time = %f seconds\n", finish-start);
 69 | 
 70 |     GET_TIME(start);
 71 |     sum = Serial_pi(n);
 72 |     GET_TIME(finish);
 73 |     printf("   Single-threaded estimate of pi = %.15f\n", sum);
 74 |     printf("   Elapsed time = %f seconds\n", finish-start);
 75 |     printf("   Math library estimate of pi    = %.15f\n", 4.0*atan(1.0));
 76 | 
 77 |     free(thread_handles);
 78 | 
 79 |     return 0;
 80 | }
 81 | 
 82 | /*****************************************************************************
 83 |  * Function:        Thread_sum
 84 |  * Purpose:         Add in the terms computed by the thread running this
 85 |  * In args:         rank
 86 |  * Return:          ignored(NULL)
 87 |  * Globals in:      n, thread_count
 88 |  * Global in/out:   sum
 89 |  *****************************************************************************/
 90 | void* Thread_sum(void* rank)
 91 | {
 92 |     long my_rank = (long)rank;
 93 |     long long my_n = n / thread_count;
 94 |     long long my_first_i = my_n * my_rank;
 95 |     long long my_last_i = my_first_i + my_n;
 96 | 
 97 |     double factor, my_sum = 0.0;
 98 |     if (my_first_i % 2 == 0)
 99 |         factor = 1.0;
100 |     else
101 |         factor = -1.0;
102 | 
103 |     for (long long i = my_first_i; i < my_last_i; i++, factor = -factor) {
104 |         my_sum += factor/(2*i + 1);
105 |     }
106 |     
107 |     while (flag != my_rank);
108 |     sum += my_sum;
109 |     flag = (flag + 1) % thread_count;
110 | 
111 |     return NULL;
112 | }
113 | 
114 | /*****************************************************************************
115 |  * Function:        Get_args
116 |  * Purpose:         Get and check command list arguments
117 |  * In args:         argc, argv
118 |  * Globals out:     thread_count, n
119 |  *****************************************************************************/
120 | void Get_args(int argc, char* argv[])
121 | {
122 |     int ok = 1;
123 |     if (argc == 3) {
124 |         thread_count = strtol(argv[1], NULL, 10);
125 |         if (thread_count < 0 || thread_count > MAX_THREADS)
126 |             ok = 0;
127 |         
128 |         n = strtoll(argv[2], NULL, 10);
129 |         if (n <= 0)
130 |             ok = 0;
131 |     }
132 |     else
133 |         ok = 0;
134 |     
135 |     if (ok == 0) {
136 |         fprintf(stderr, "Usage: %s <number of threads> <n>\n", argv[0]);
137 |         fprintf(stderr, "   n is the number of terms and should be >= 1\n");
138 |         fprintf(stderr, "   n should be evenly divisible by the number of threads\n");
139 |         exit(0);
140 |     }
141 | }
142 | 
/*****************************************************************************
 * Function:        Serial_pi
 * Purpose:         Estimate pi on the calling thread only
 * In args:         n
 * Return:          4 * [1 - 1/3 + 1/5 - ...], summed over the first n terms
 *****************************************************************************/
double Serial_pi(long long n)
{
    double running = 0.0;
    double sign = 1.0;
    long long idx = 0;

    /* Each term is sign/(2*idx + 1); the sign flips after every term */
    while (idx < n) {
        running += sign / (2*idx + 1);
        sign = -sign;
        idx++;
    }

    return 4.0 * running;
}


--------------------------------------------------------------------------------
/pthread/06_pth_message.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        06_pth_message.c
 3 |  * Purpose:     Illustrate a synchronization problem with pthreads:
 4 |  *              create some threads, each of which creates a message and sends it to
 5 |  *              another thread, by copying it into that thread's buffer.
 6 |  * 
 7 |  * Compile:     gcc -Wall -o 06_pth_message 06_pth_message.c -pthread [-lm]
 8 |  * Run:         ./06_pth_message <number of threads>
 9 |  * 
10 |  * Input:       none
11 |  * Output:      message from each thread
12 |  *****************************************************************************/
13 | #include <stdio.h>
14 | #include <stdlib.h>
15 | #include <pthread.h>
16 | 
const int MAX_THREADS = 1024;   /* largest thread count accepted on the command line */
const int MSG_MAX = 100;        /* size in bytes of each message buffer */

/* Global variables */
long thread_count;              /* number of threads, parsed from argv[1] */
char** messages;                /* messages[i]: buffer addressed to thread i, NULL until sent */

void* Send_message(void* rank); /* Thread function */

/*****************************************************************************
 * Function:        main
 * Purpose:         Parse the thread count, start one Send_message thread per
 *                  rank, wait for all of them and release every buffer.
 * In args:         argc, argv (argv[1] is the number of threads)
 * Global out:      thread_count, messages
 * Return:          0 on success; the process exits with EXIT_FAILURE on a
 *                  usage, allocation or thread-creation error
 *****************************************************************************/
int main(int argc, char* argv[])
{
    pthread_t* thread_handles;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    thread_count = strtol(argv[1], NULL, 10);
    if (thread_count <= 0 || thread_count > MAX_THREADS) {
        /* MAX_THREADS itself is accepted, so the bound is inclusive */
        fprintf(stderr, "The number of threads should be > 0 and <= %d\n", MAX_THREADS);
        exit(EXIT_FAILURE);
    }

    thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
    messages = (char**)malloc(thread_count * sizeof(char*));
    if (thread_handles == NULL || messages == NULL) {
        fprintf(stderr, "Failed to allocate memory for %ld threads\n", thread_count);
        free(thread_handles);
        free(messages);
        exit(EXIT_FAILURE);
    }

    /* Every slot starts empty so a receiver can tell "nothing sent yet" */
    for (long thread = 0; thread < thread_count; thread++)
        messages[thread] = NULL;

    for (long thread = 0; thread < thread_count; thread++) {
        int err = pthread_create(&thread_handles[thread], NULL, Send_message, (void*)thread);
        if (err != 0) {
            /* Joining a handle that was never filled in is undefined, so give up here;
               exit() also terminates the threads that were already started. */
            fprintf(stderr, "pthread_create failed for thread %ld (error %d)\n", thread, err);
            exit(EXIT_FAILURE);
        }
    }
    for (long thread = 0; thread < thread_count; thread++)
        pthread_join(thread_handles[thread], NULL);

    /* The buffers were allocated by the sending threads; free(NULL) is a no-op */
    for (long thread = 0; thread < thread_count; thread++)
        free(messages[thread]);
    free(messages);

    free(thread_handles);

    return 0;
}
59 | 
60 | /*****************************************************************************
61 |  * Function:        Send_message
62 |  * Purpose:         Create a message and send it by copying it into
63 |  *                  global messages array. Receive a message and print it.
64 |  * In args:         rank
65 |  * Global in:       thread_count
66 |  * Global in/out:   messages
67 |  * Return:          ignored(NULL)
68 |  * Note:            The my_msg buffer is freed in main function
69 |  *****************************************************************************/
70 | void* Send_message(void* rank)
71 | {
72 |     long my_rank = (long)rank;
73 |     long dest = (my_rank + 1) % thread_count;
74 |     long src = (my_rank + thread_count - 1) % thread_count;
75 |     char* my_msg = (char*)malloc(MSG_MAX * sizeof(char));
76 | 
77 |     sprintf(my_msg, "Hello to %ld from %ld", dest, my_rank);
78 |     messages[dest] = my_msg;
79 | 
80 |     if (messages[my_rank] != NULL)
81 |         printf("Thread %ld > %s\n", my_rank, messages[my_rank]);
82 |     else
83 |         printf("Thread %ld > No message from %ld\n", my_rank, src);
84 | 
85 |     return NULL;
86 | }


--------------------------------------------------------------------------------
/pthread/07_pth_message_sem.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        07_pth_message_sem.c
 3 |  * Purpose:     Illustrate a synchronization problem with pthreads:
 4 |  *              create some threads, each of which creates a message and sends it to
 5 |  *              another thread, by copying it into that thread's buffer.
 6 |  *              This version uses semaphores to solve the synchronization problem
 7 |  * 
 8 |  * Compile:     gcc -Wall -o 07_pth_message_sem 07_pth_message_sem.c -pthread [-lm]
 9 |  * Run:         ./07_pth_message_sem <number of threads>
10 |  * 
11 |  * Input:       none
12 |  * Output:      message from each thread
13 |  *****************************************************************************/
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <pthread.h>
17 | #include <semaphore.h>
18 | 
const int MAX_THREADS = 1024;   /* largest thread count accepted on the command line */
const int MSG_MAX = 100;        /* size in bytes of each message buffer */

/* Global variables */
long  thread_count;             /* number of threads, parsed from argv[1] */
char** messages;                /* messages[i]: buffer addressed to thread i */
sem_t* semaphores;              /* semaphores[i] is posted once messages[i] has been written */

void* Send_message(void* rank); /* Thread function */

/*****************************************************************************
 * Function:        main
 * Purpose:         Parse the thread count, create one semaphore and one
 *                  Send_message thread per rank, wait for all threads and
 *                  release every resource.
 * In args:         argc, argv (argv[1] is the number of threads)
 * Global out:      thread_count, messages, semaphores
 * Return:          0 on success; the process exits with EXIT_FAILURE on a
 *                  usage, allocation, semaphore or thread-creation error
 *****************************************************************************/
int main(int argc, char* argv[])
{
    pthread_t* thread_handles;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    thread_count = strtol(argv[1], NULL, 10);
    if (thread_count <= 0 || thread_count > MAX_THREADS) {
        /* MAX_THREADS itself is accepted, so the bound is inclusive */
        fprintf(stderr, "The number of threads should be > 0 and <= %d\n", MAX_THREADS);
        exit(EXIT_FAILURE);
    }

    thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
    messages = (char**)malloc(thread_count * sizeof(char*));
    semaphores = (sem_t*)malloc(thread_count * sizeof(sem_t));
    if (thread_handles == NULL || messages == NULL || semaphores == NULL) {
        fprintf(stderr, "Failed to allocate memory for %ld threads\n", thread_count);
        free(thread_handles);
        free(messages);
        free(semaphores);
        exit(EXIT_FAILURE);
    }

    for (long thread = 0; thread < thread_count; thread++) {
        messages[thread] = NULL;
        /* Initial value 0: a receiver blocks until its sender posts */
        if (sem_init(&semaphores[thread], 0, 0) != 0) {
            perror("sem_init");
            exit(EXIT_FAILURE);
        }
    }
    for (long thread = 0; thread < thread_count; thread++) {
        int err = pthread_create(&thread_handles[thread], NULL, Send_message, (void*)thread);
        if (err != 0) {
            /* The threads already running wait on senders that will never exist,
               so joining them could block forever; terminate the process instead. */
            fprintf(stderr, "pthread_create failed for thread %ld (error %d)\n", thread, err);
            exit(EXIT_FAILURE);
        }
    }
    for (long thread = 0; thread < thread_count; thread++)
        pthread_join(thread_handles[thread], NULL);

    for (long thread = 0; thread < thread_count; thread++) {
        free(messages[thread]);
        sem_destroy(&semaphores[thread]);
    }
    free(messages);
    free(semaphores);
    free(thread_handles);

    return 0;
}
67 | 
68 | /*****************************************************************************
69 |  * Function:        Send_message
70 |  * Purpose:         Create a message and send it by copying it into
71 |  *                  global messages array. Receive a message and print it.
72 |  * In args:         rank
73 |  * Global in:       thread_count
74 |  * Global in/out:   messages, semaphores
75 |  * Return:          ignored(NULL)
76 |  * Note:            The my_msg buffer is freed in main function
77 |  *****************************************************************************/
78 | void* Send_message(void* rank)
79 | {
80 |     long my_rank = (long)rank;
81 |     long dest = (my_rank + 1) % thread_count;
82 |     char* my_msg = (char*)malloc(MSG_MAX * sizeof(char));
83 | 
84 |     sprintf(my_msg, "Hello to %ld from %ld", dest, my_rank);
85 |     messages[dest] = my_msg;
86 |     sem_post(&semaphores[dest]); // increase semaphores[dest] by 1 -> 'unlock' the semaphore of dest
87 | 
88 |     sem_wait(&semaphores[my_rank]); // decrease semaphores[my_rank] by 1 and return -> wait for our semaphore to be unlocked
89 |     printf("Thread %ld > %s\n", my_rank, messages[my_rank]);
90 | 
91 |     return NULL;
92 | }


--------------------------------------------------------------------------------
/pthread/08_pth_busy_barrier.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        08_pth_busy_barrier.c
 3 |  * Purpose:     Use busy-waiting and mutex barriers to synchronize threads
 4 |  * 
 5 |  * Compile:     gcc -Wall -o 08_pth_busy_barrier 08_pth_busy_barrier.c -pthread
 6 |  *              [-DDEBUG]
 7 |  * Run:         ./08_pth_busy_barrier <number of threads>
 8 |  * 
 9 |  * Input:       none
10 |  * Output:      Time for BARRIER_COUNT barriers
11 |  * 
12 |  * Note:        Verbose output can be enabled with the compile flag -DDEBUG
13 |  *****************************************************************************/
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <pthread.h>
17 | #include <sys/time.h>
18 | 
/* Store the current wall-clock time, in seconds, into the double lvalue `now`.
   Wrapped in do/while(0) so the macro behaves like a single statement. */
#define GET_TIME(now)                               \
    do {                                            \
        struct timeval t;                           \
        gettimeofday(&t, NULL);                     \
        now = t.tv_sec + t.tv_usec / 1000000.0;     \
    } while (0)

#define BARRIER_COUNT 100   /* number of barriers every thread passes through */

/* Global variables */
long thread_count;                          /* number of threads, parsed from argv[1] */
int barrier_thread_counts[BARRIER_COUNT];   /* arrivals per barrier; guarded by barrier_mutex */
pthread_mutex_t barrier_mutex;

void* Thread_work(void* rank);

/*****************************************************************************
 * Function:        main
 * Purpose:         Parse the thread count, run Thread_work on every thread
 *                  and report the time from creating the first thread to
 *                  joining the last one.
 * In args:         argc, argv (argv[1] is the number of threads)
 * Global out:      thread_count, barrier_thread_counts, barrier_mutex
 * Return:          0 on success; the process exits with EXIT_FAILURE on a
 *                  usage, allocation, mutex or thread-creation error
 *****************************************************************************/
int main(int argc, char* argv[])
{
    pthread_t* thread_handles;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    thread_count = strtol(argv[1], NULL, 10);
    if (thread_count <= 0) {
        fprintf(stderr, "The number of threads should be > 0\n");
        exit(EXIT_FAILURE);
    }

    thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
    if (thread_handles == NULL) {
        fprintf(stderr, "Failed to allocate memory for %ld threads\n", thread_count);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < BARRIER_COUNT; i++)
        barrier_thread_counts[i] = 0;
    if (pthread_mutex_init(&barrier_mutex, NULL) != 0) {
        fprintf(stderr, "pthread_mutex_init failed\n");
        free(thread_handles);
        exit(EXIT_FAILURE);
    }

    double start, finish;
    GET_TIME(start);
    for (long thread = 0; thread < thread_count; thread++) {
        int err = pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread);
        if (err != 0) {
            /* The barrier waits for thread_count arrivals; with a thread missing the
               others would spin forever, so terminate instead of joining. */
            fprintf(stderr, "pthread_create failed for thread %ld (error %d)\n", thread, err);
            exit(EXIT_FAILURE);
        }
    }
    for (long thread = 0; thread < thread_count; thread++)
        pthread_join(thread_handles[thread], NULL);
    GET_TIME(finish);

    printf("Elapsed time = %f seconds\n", finish - start);

    pthread_mutex_destroy(&barrier_mutex);
    free(thread_handles);

    return 0;
}
70 | 
71 | /*****************************************************************************
72 |  * Function:        Thread_work
73 |  * Purpose:         Run BARRIER_COUNT barriers
74 |  * In args:         rank
75 |  * Global var:      thread_count, barrier_thread_counts, barrier_mutex
76 |  * Return:          ignored(NULL)
77 |  *****************************************************************************/
78 | void* Thread_work(void* rank)
79 | {
80 | #ifdef DEBUG
81 |     long my_rank = (long)rank;
82 | #endif
83 | 
84 |     for (int i = 0; i < BARRIER_COUNT; i++) {
85 |         pthread_mutex_lock(&barrier_mutex);
86 |         barrier_thread_counts[i]++;
87 |         pthread_mutex_unlock(&barrier_mutex);
88 |         while (barrier_thread_counts[i] < thread_count);
89 | #ifdef DEBUG
90 |         if (my_rank == 0) {
91 |             printf("All threads completed barrier %d\n", i);
92 |             fflush(stdout);
93 |         }
94 | #endif
95 |     }
96 | 
97 |     return NULL;
98 | }


--------------------------------------------------------------------------------
/pthread/09_pth_sem_barrier.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        09_pth_sem_barrier.c
  3 |  * Purpose:     Use semaphore barriers to synchronize threads
  4 |  * 
  5 |  * Compile:     gcc -Wall -o 09_pth_sem_barrier 09_pth_sem_barrier.c -pthread
  6 |  *              [-DDEBUG]
  7 |  * Run:         ./09_pth_sem_barrier <number of threads>
  8 |  * 
  9 |  * Input:       none
 10 |  * Output:      Time for BARRIER_COUNT barriers
 11 |  * 
 12 |  * Note:        Verbose output can be enabled with the compile flag -DDEBUG
 13 |  *****************************************************************************/
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <pthread.h>
 17 | #include <semaphore.h>
 18 | #include <sys/time.h>
 19 | 
 20 | #define GET_TIME(now)                           \
 21 |     {                                           \
 22 |         struct timeval t;                       \
 23 |         gettimeofday(&t, NULL);                 \
 24 |         now = t.tv_sec + t.tv_usec / 1000000.0; \
 25 |     }
 26 | 
 27 | #define BARRIER_COUNT 100
 28 | 
 29 | /* Global variables */
 30 | long thread_count;
 31 | long counter;
 32 | sem_t barrier_sems[BARRIER_COUNT];
 33 | sem_t count_sem;
 34 | 
 35 | void* Thread_work(void* rank);
 36 | 
 37 | int main(int argc, char* argv[])
 38 | {
 39 |     pthread_t* thread_handles;
 40 | 
 41 |     if (argc != 2) {
 42 |         fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
 43 |         exit(0);
 44 |     }
 45 | 
 46 |     thread_count = strtol(argv[1], NULL, 10);
 47 |     if (thread_count <= 0) {
 48 |         fprintf(stderr, "The number of threads should be > 0\n");
 49 |         exit(0);
 50 |     }
 51 | 
 52 |     thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
 53 |     for (int i = 0; i < BARRIER_COUNT; i++)
 54 |         sem_init(&barrier_sems[i], 0, 0);
 55 |     sem_init(&count_sem, 0, 1);
 56 | 
 57 |     double start, finish;
 58 |     GET_TIME(start);
 59 |     for (long thread = 0; thread < thread_count; thread++)
 60 |         pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread);
 61 |     for (long thread = 0; thread < thread_count; thread++)
 62 |         pthread_join(thread_handles[thread], NULL);
 63 |     GET_TIME(finish);
 64 | 
 65 |     printf("Elapsed time = %f seconds\n", finish - start);
 66 | 
 67 |     sem_destroy(&count_sem);
 68 |     for (int i = 0; i < BARRIER_COUNT; i++)
 69 |         sem_destroy(&barrier_sems[i]);
 70 |     free(thread_handles);
 71 | 
 72 |     return 0;
 73 | }
 74 | 
 75 | /*****************************************************************************
 76 |  * Function:        Thread_work
 77 |  * Purpose:         Run BARRIER_COUNT barriers
 78 |  * In args:         rank
 79 |  * Global var:      thread_count, count, barrier_sems, count_sem
 80 |  * Return:          ignored(NULL)
 81 |  *****************************************************************************/
 82 | void* Thread_work(void* rank)
 83 | {
 84 | #ifdef DEBUG
 85 |     long my_rank = (long)rank;
 86 | #endif
 87 | 
 88 |     for (int i = 0; i < BARRIER_COUNT; i++) {
 89 |         sem_wait(&count_sem);
 90 |         if (counter == thread_count - 1) {
 91 |             counter = 0;
 92 |             sem_post(&count_sem);
 93 |             for (int j = 0; j < thread_count - 1; j++)
 94 |                 sem_post(&barrier_sems[i]);
 95 |         }
 96 |         else {
 97 |             counter++;
 98 |             sem_post(&count_sem);
 99 |             sem_wait(&barrier_sems[i]);
100 |         }
101 | #ifdef DEBUG
102 |         if (my_rank == 0) {
103 |             printf("All threads completed barrier %d\n", i);
104 |             fflush(stdout);
105 |         }
106 | #endif
107 |     }
108 | 
109 |     return NULL;
110 | }


--------------------------------------------------------------------------------
/pthread/10_pth_cond_barrier.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        10_pth_cond_barrier.c
  3 |  * Purpose:     Use condition wait barriers to synchronize threads
  4 |  * 
  5 |  * Compile:     gcc -Wall -o 10_pth_cond_barrier 10_pth_cond_barrier.c -pthread
  6 |  *              [-DDEBUG]
  7 |  * Run:         ./10_pth_cond_barrier <number of threads>
  8 |  * 
  9 |  * Input:       none
 10 |  * Output:      Time for BARRIER_COUNT barriers
 11 |  * 
 12 |  * Note:        Verbose output can be enabled with the compile flag -DDEBUG
 13 |  *****************************************************************************/
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <pthread.h>
 17 | #include <sys/time.h>
 18 | 
 19 | #define GET_TIME(now)                           \
 20 |     {                                           \
 21 |         struct timeval t;                       \
 22 |         gettimeofday(&t, NULL);                 \
 23 |         now = t.tv_sec + t.tv_usec / 1000000.0; \
 24 |     }
 25 | //#define DEBUG
 26 | #define BARRIER_COUNT 100
 27 | 
 28 | /* Global variables */
 29 | long thread_count, barrier_thread_count;
 30 | pthread_mutex_t barrier_mutex;
 31 | pthread_cond_t ok_to_proceed;
 32 | 
 33 | void* Thread_work(void* rank);
 34 | 
 35 | int main(int argc, char* argv[])
 36 | {
 37 |     pthread_t* thread_handles;
 38 | 
 39 |     if (argc != 2) {
 40 |         fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
 41 |         exit(0);
 42 |     }
 43 | 
 44 |     thread_count = strtol(argv[1], NULL, 10);
 45 |     if (thread_count <= 0) {
 46 |         fprintf(stderr, "The number of threads should be > 0\n");
 47 |         exit(0);
 48 |     }
 49 | 
 50 |     thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
 51 |     pthread_mutex_init(&barrier_mutex, NULL);
 52 |     pthread_cond_init(&ok_to_proceed, NULL);
 53 | 
 54 |     double start, finish;
 55 |     GET_TIME(start);
 56 |     for (long thread = 0; thread < thread_count; thread++)
 57 |         pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread);
 58 |     for (long thread = 0; thread < thread_count; thread++)
 59 |         pthread_join(thread_handles[thread], NULL);
 60 |     GET_TIME(finish);
 61 | 
 62 |     printf("Elapsed time = %f seconds\n", finish - start);
 63 | 
 64 |     pthread_mutex_destroy(&barrier_mutex);
 65 |     pthread_cond_destroy(&ok_to_proceed);
 66 |     free(thread_handles);
 67 | 
 68 |     return 0;
 69 | }
 70 | 
 71 | /*****************************************************************************
 72 |  * Function:        Thread_work
 73 |  * Purpose:         Run BARRIER_COUNT barriers
 74 |  * In args:         rank
 75 |  * Global var:      thread_count, barrier_thread_count, barrier_mutex
 76 |  * Return:          ignored(NULL)
 77 |  *****************************************************************************/
 78 | void* Thread_work(void* rank)
 79 | {
 80 | #ifdef DEBUG
 81 |     long my_rank = (long)rank;
 82 | #endif
 83 | 
 84 |     for (int i = 0; i < BARRIER_COUNT; i++) {
 85 |         pthread_mutex_lock(&barrier_mutex);
 86 |         barrier_thread_count++;
 87 | 
 88 |         if (barrier_thread_count == thread_count) {
 89 |             barrier_thread_count = 0;
 90 | #ifdef DEBUG
 91 |             printf("Thread %ld > Signalling other threads in barrier %d\n", my_rank, i);
 92 |             fflush(stdout);
 93 | #endif
 94 |             pthread_cond_broadcast(&ok_to_proceed);
 95 |         }
 96 |         else {
 97 |             // Wait unlocks mutex and puts thread to sleep.
 98 |             //    Put wait in while loop in case some other
 99 |             // event awakens thread.
100 |             while (pthread_cond_wait(&ok_to_proceed, &barrier_mutex) != 0);
101 |             // Mutex is relocked at this point.
102 | #ifdef DEBUG
103 |             printf("Thread %ld > Awakened in barrier %d\n", my_rank, i);
104 | #endif
105 |         }
106 |         pthread_mutex_unlock(&barrier_mutex);
107 | #ifdef DEBUG
108 |         if (my_rank == 0) {
109 |             printf("All threads completed barrier %d\n", i);
110 |             fflush(stdout);
111 |         }
112 | #endif
113 |     }
114 | 
115 |     return NULL;
116 | }


--------------------------------------------------------------------------------
/pthread/11_pth_posix_barrier.c:
--------------------------------------------------------------------------------
 1 | /*****************************************************************************
 2 |  * File:        11_pth_posix_barrier.c
 3 |  * Purpose:     Use POSIX barrier to synchronize threads
 4 |  * 
 5 |  * Compile:     gcc -Wall -o 11_pth_posix_barrier 11_pth_posix_barrier.c -pthread
 6 |  *              [-DDEBUG]
 7 |  * Run:         ./11_pth_posix_barrier <number of threads>
 8 |  * 
 9 |  * Input:       none
10 |  * Output:      Time for BARRIER_COUNT barriers
11 |  * 
12 |  * Note:        Verbose output can be enabled with the compile flag -DDEBUG
13 |  *****************************************************************************/
14 | #include <stdio.h>
15 | #include <stdlib.h>
16 | #include <pthread.h>
17 | #include <sys/time.h>
18 | 
19 | #define GET_TIME(now)                           \
20 |     {                                           \
21 |         struct timeval t;                       \
22 |         gettimeofday(&t, NULL);                 \
23 |         now = t.tv_sec + t.tv_usec / 1000000.0; \
24 |     }
25 | 
26 | #define BARRIER_COUNT 100
27 | 
28 | /* Global variables */
29 | long thread_count;
30 | pthread_barrier_t barrier;
31 | 
32 | void* Thread_work(void* rank);
33 | 
34 | int main(int argc, char* argv[])
35 | {
36 |     pthread_t* thread_handles;
37 | 
38 |     if (argc != 2) {
39 |         fprintf(stderr, "Usage: %s <number of threads>\n", argv[0]);
40 |         exit(0);
41 |     }
42 | 
43 |     thread_count = strtol(argv[1], NULL, 10);
44 |     if (thread_count <= 0) {
45 |         fprintf(stderr, "The number of threads should be > 0\n");
46 |         exit(0);
47 |     }
48 | 
49 |     thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
50 |     pthread_barrier_init(&barrier, NULL, thread_count);
51 | 
52 |     double start, finish;
53 |     GET_TIME(start);
54 |     for (long thread = 0; thread < thread_count; thread++)
55 |         pthread_create(&thread_handles[thread], NULL, Thread_work, (void*)thread);
56 |     for (long thread = 0; thread < thread_count; thread++)
57 |         pthread_join(thread_handles[thread], NULL);
58 |     GET_TIME(finish);
59 | 
60 |     printf("Elapsed time = %f seconds\n", finish - start);
61 | 
62 |     pthread_barrier_destroy(&barrier);
63 |     free(thread_handles);
64 | 
65 |     return 0;
66 | }
67 | 
68 | /*****************************************************************************
69 |  * Function:        Thread_work
70 |  * Purpose:         Run BARRIER_COUNT barriers
71 |  * In args:         rank
72 |  * Global var:      thread_count, barrier
73 |  * Return:          ignored(NULL)
74 |  *****************************************************************************/
75 | void* Thread_work(void* rank)
76 | {
77 | #ifdef DEBUG
78 |     long my_rank = (long)rank;
79 | #endif
80 | 
81 |     for (int i = 0; i < BARRIER_COUNT; i++) {
82 |         pthread_barrier_wait(&barrier);
83 | #ifdef DEBUG
84 |         if (my_rank == 0) {
85 |             printf("All threads completed barrier %d\n", i);
86 |             fflush(stdout);
87 |         }
88 | #endif
89 |     }
90 | 
91 |     return NULL;
92 | }


--------------------------------------------------------------------------------
/pthread/12_pth_tokenize.c:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 |  * File:        12_pth_tokenize.c
  3 |  * Purpose:     Try to use threads to tokenize text input. Illustrate problems
  4 |  *              with function that isn't thread-safe.
  5 |  * 
  6 |  *              * This program definitely has problems.
  7 |  * 
  8 |  * Compile:     gcc -Wall -o 12_pth_tokenize 12_pth_tokenize.c -pthread
  9 |  * Run:         ./12_pth_tokenize <number of threads>
 10 |  * 
 11 |  * Input:       Lines of text
 12 |  * Output:      For each line of input:
 13 |  *                the line read by the program, and the tokens identified by
 14 |  *                strtok
 15 |  *****************************************************************************/
 16 | #include <stdio.h>
 17 | #include <stdlib.h>
 18 | #include <string.h>
 19 | #include <pthread.h>
 20 | #include <semaphore.h>
 21 | 
 22 | const int MAX = 1000;
 23 | 
 24 | long thread_count;
 25 | sem_t* sems;
 26 | 
 27 | void Usage(char* prog_name);
 28 | void *Tokenize(void* rank); /* thread function */
 29 | 
 30 | int main(int argc, char* argv[])
 31 | {
 32 |     if (argc != 2)
 33 |         Usage(argv[0]);
 34 |     thread_count = atoi(argv[1]);
 35 |     
 36 |     pthread_t* thread_handles = (pthread_t*)malloc(thread_count * sizeof(pthread_t));
 37 |     sems = (sem_t*)malloc(thread_count * sizeof(sem_t));
 38 |     // sems[0] should be unlocked, the others should be locked
 39 |     sem_init(&sems[0], 0, 1);
 40 |     for (long thread = 1; thread < thread_count; thread++)
 41 |         sem_init(&sems[thread], 0, 0);
 42 |     
 43 |     printf("Enter text\n");
 44 |     for (long thread = 0; thread < thread_count; thread++)
 45 |         pthread_create(&thread_handles[thread], NULL, Tokenize, (void*)thread);
 46 |     
 47 |     for (long thread = 0; thread < thread_count; thread++)
 48 |         pthread_join(thread_handles[thread], NULL);
 49 |     
 50 |     for (long thread = 0; thread < thread_count; thread++)
 51 |         sem_destroy(&sems[thread]);
 52 |     
 53 |     free(sems);
 54 |     free(thread_handles);
 55 |     return 0;
 56 | }
 57 | 
 58 | /*****************************************************************************
 59 |  * Function:        Usage
 60 |  * Purpose:         Print command line for function and terminate
 61 |  * In args:         prog_name
 62 |  *****************************************************************************/
 63 | void Usage(char* prog_name)
 64 | {
 65 |     fprintf(stderr, "Usage: %s <number of threads>\n", prog_name);
 66 |     exit(0);
 67 | }
 68 | 
 69 | /*****************************************************************************
 70 |  * Function:        Tokenize
 71 |  * Purpose:         Tokenize lines of input
 72 |  * In args:         rank
 73 |  * Global var:      thread_count, sems
 74 |  * Return:          ignored(NULL)
 75 |  *****************************************************************************/
 76 | void* Tokenize(void* rank)
 77 | {
 78 |     long my_rank = (long)rank;
 79 |     int count;
 80 |     int next = (my_rank + 1) % thread_count;
 81 |     char* fg_rv;
 82 |     char my_line[MAX];
 83 |     char* my_string;
 84 | 
 85 |     /* Force sequential reading of the input */
 86 |     sem_wait(&sems[my_rank]);
 87 |     fg_rv = fgets(my_line, MAX, stdin);
 88 |     sem_post(&sems[next]);
 89 | 
 90 |     while (fg_rv != NULL) {
 91 |         printf("Thread %ld > my_line = %s", my_rank, my_line);
 92 | 
 93 |         count = 0;
 94 |         my_string = strtok(my_line, " \t\n");
 95 |         while (my_string != NULL) {
 96 |             count++;
 97 |             printf("Thread %ld > string %d = %s\n", my_rank, count, my_string);
 98 |             my_string = strtok(NULL, " \t\n");
 99 |         }
100 | 
101 |         //if (my_line != NULL)
102 |             //printf("Thread %ld > After tokenizing, my_line = %s\n", my_rank, my_line);
103 |         
104 |         sem_wait(&sems[my_rank]);
105 |         fg_rv = fgets(my_line, MAX, stdin);
106 |         sem_post(&sems[next]);
107 |     }
108 | 
109 |     return NULL;
110 | }


--------------------------------------------------------------------------------